From f5280ebf6e0a657e1d046db64f9e54829346481c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 19 Jun 2022 01:23:45 +0300 Subject: [PATCH 001/364] mdbx: bump version to 0.12.0 (not a release but preparation for changing feature set and API). --- ChangeLog.md | 5 +++++ mdbx.h | 4 ++-- src/man1/mdbx_chk.1 | 2 +- src/man1/mdbx_copy.1 | 2 +- src/man1/mdbx_drop.1 | 2 +- src/man1/mdbx_dump.1 | 2 +- src/man1/mdbx_load.1 | 2 +- src/man1/mdbx_stat.1 | 2 +- 8 files changed, 13 insertions(+), 8 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 945b2eaa..b830b543 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,11 @@ ChangeLog --------- +## v0.12.0 at 2022-06-19 + +Not a release but preparation for changing feature set and API. + + ## v0.11.8 at 2022-06-12 Acknowledgements: diff --git a/mdbx.h b/mdbx.h index c6211aba..12540c23 100644 --- a/mdbx.h +++ b/mdbx.h @@ -626,9 +626,9 @@ typedef mode_t mdbx_mode_t; extern "C" { #endif -/* MDBX version 0.11.x */ +/* MDBX version 0.12.x */ #define MDBX_VERSION_MAJOR 0 -#define MDBX_VERSION_MINOR 11 +#define MDBX_VERSION_MINOR 12 #ifndef LIBMDBX_API #if defined(LIBMDBX_EXPORTS) diff --git a/src/man1/mdbx_chk.1 b/src/man1/mdbx_chk.1 index c352c38f..343b80cb 100644 --- a/src/man1/mdbx_chk.1 +++ b/src/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ .\" Copyright 2015-2022 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_CHK 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_CHK 1 "2022-06-19" "MDBX 0.12.0" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/src/man1/mdbx_copy.1 b/src/man1/mdbx_copy.1 index 54512620..4a861172 100644 --- a/src/man1/mdbx_copy.1 +++ b/src/man1/mdbx_copy.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_COPY 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_COPY 1 "2022-06-19" "MDBX 0.12.0" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/src/man1/mdbx_drop.1 b/src/man1/mdbx_drop.1 index d8859a57..15945800 100644 --- a/src/man1/mdbx_drop.1 +++ b/src/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ .\" Copyright 2021-2022 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_DROP 1 "2022-06-19" "MDBX 0.12.0" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/src/man1/mdbx_dump.1 b/src/man1/mdbx_dump.1 index 403b2fab..4e360edf 100644 --- a/src/man1/mdbx_dump.1 +++ b/src/man1/mdbx_dump.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_DUMP 1 "2022-06-19" "MDBX 0.12.0" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/src/man1/mdbx_load.1 b/src/man1/mdbx_load.1 index 01c58b01..1363d56b 100644 --- a/src/man1/mdbx_load.1 +++ b/src/man1/mdbx_load.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_LOAD 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_LOAD 1 "2022-06-19" "MDBX 0.12.0" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/src/man1/mdbx_stat.1 b/src/man1/mdbx_stat.1 index f2ebbcf9..1580ed44 100644 --- a/src/man1/mdbx_stat.1 +++ b/src/man1/mdbx_stat.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_STAT 1 "2022-04-22" "MDBX 0.11.8" +.TH MDBX_STAT 1 "2022-06-19" "MDBX 0.12.0" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS From f1ccc717b48da5e673ada082c5dec1d898995a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 20 Jun 2022 20:16:54 +0300 Subject: [PATCH 002/364] mdbx: add update-gc context (extracted from `bigfoot`). --- src/core.c | 527 ++++++++++++++++++++++++++---------------------- src/internals.h | 8 +- 2 files changed, 289 insertions(+), 246 deletions(-) diff --git a/src/core.c b/src/core.c index fd367c86..17a960e9 100644 --- a/src/core.c +++ b/src/core.c @@ -9090,77 +9090,110 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, return MDBX_PROBLEM; } -static __always_inline unsigned backlog_size(MDBX_txn *txn) { +typedef struct gc_update_context { + unsigned retired_stored, loop; + unsigned settled, cleaned_slot, reused_slot, filled_slot; + txnid_t cleaned_id, rid; + bool lifo, dense; + MDBX_cursor_couple cursor; +} gcu_context_t; + +static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { + memset(ctx, 0, offsetof(gcu_context_t, cursor)); + ctx->lifo = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0; + return mdbx_cursor_init(&ctx->cursor.outer, txn, FREE_DBI); +} + +static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) { return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; } +static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { + int err = MDBX_SUCCESS; + if (ctx->retired_stored) { + MDBX_val key, val; + key.iov_base = &txn->mt_txnid; + key.iov_len = sizeof(txnid_t); + const struct cursor_set_result csr = + mdbx_cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); + if (csr.err == MDBX_SUCCESS && csr.exact) { + ctx->retired_stored = 0; + err = mdbx_cursor_del(&ctx->cursor.outer, 0); + mdbx_trace("== clear-4linear, backlog %u, 
err %d", gcu_backlog_size(txn), + err); + } + } + return err; +} + /* LY: Prepare a backlog of pages to modify GC itself, * while reclaiming is prohibited. It should be enough to prevent search * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */ -static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor, - const size_t pnl_bytes, unsigned *retired_stored) { - const unsigned linear4list = number_of_ovpages(txn->mt_env, pnl_bytes); +static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, + const bool reserve4retired) { + const unsigned pages4retiredlist = + reserve4retired ? number_of_ovpages( + txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)) + : 0; const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; const unsigned backlog4rebalance = backlog4cow + 1; - if (likely(linear4list == 1 && - backlog_size(txn) > (pnl_bytes - ? backlog4rebalance - : (backlog4cow + backlog4rebalance)))) + if (likely(pages4retiredlist < 2 && + gcu_backlog_size(txn) > (reserve4retired + ? backlog4rebalance + : (backlog4cow + backlog4rebalance)))) return MDBX_SUCCESS; - mdbx_trace(">> pnl_bytes %zu, backlog %u, 4list %u, 4cow %u, 4rebalance %u", - pnl_bytes, backlog_size(txn), linear4list, backlog4cow, - backlog4rebalance); + mdbx_trace( + ">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u", + reserve4retired ? 
'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, + backlog4cow, backlog4rebalance); - MDBX_val gc_key, fake_val; int err; - if (unlikely(linear4list > 2)) { - gc_key.iov_base = fake_val.iov_base = nullptr; - gc_key.iov_len = sizeof(txnid_t); - fake_val.iov_len = pnl_bytes; - err = mdbx_cursor_spill(gc_cursor, &gc_key, &fake_val); + if (unlikely(pages4retiredlist > 2)) { + MDBX_val key, val; + key.iov_base = val.iov_base = nullptr; + key.iov_len = sizeof(txnid_t); + val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); + err = mdbx_cursor_spill(&ctx->cursor.outer, &key, &val); if (unlikely(err != MDBX_SUCCESS)) return err; } - gc_cursor->mc_flags &= ~C_RECLAIMING; - err = mdbx_cursor_touch(gc_cursor); - mdbx_trace("== after-touch, backlog %u, err %d", backlog_size(txn), err); + ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + err = mdbx_cursor_touch(&ctx->cursor.outer); + mdbx_trace("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err); - if (unlikely(linear4list > 1) && err == MDBX_SUCCESS) { - if (retired_stored) { - gc_key.iov_base = &txn->mt_txnid; - gc_key.iov_len = sizeof(txn->mt_txnid); - const struct cursor_set_result csr = - mdbx_cursor_set(gc_cursor, &gc_key, &fake_val, MDBX_SET); - if (csr.err == MDBX_SUCCESS && csr.exact) { - *retired_stored = 0; - err = mdbx_cursor_del(gc_cursor, 0); - mdbx_trace("== clear-4linear, backlog %u, err %d", backlog_size(txn), - err); - } - } - err = - mdbx_page_alloc(gc_cursor, linear4list, MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) - .err; - mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err); - mdbx_cassert(gc_cursor, - backlog_size(txn) >= linear4list || err != MDBX_SUCCESS); + if (unlikely(pages4retiredlist > 1) && + MDBX_PNL_SIZE(txn->tw.retired_pages) != ctx->retired_stored && + err == MDBX_SUCCESS) { + mdbx_tassert(txn, reserve4retired); + err = gcu_clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = mdbx_page_alloc(&ctx->cursor.outer, 
pages4retiredlist, + MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) + .err; + mdbx_trace("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), + err); + mdbx_cassert(&ctx->cursor.outer, + gcu_backlog_size(txn) >= pages4retiredlist || + err != MDBX_SUCCESS); } - while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS) - err = mdbx_page_alloc(gc_cursor, 0, + while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && + err == MDBX_SUCCESS) + err = mdbx_page_alloc(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE | MDBX_ALLOC_NOLOG) .err; - gc_cursor->mc_flags |= C_RECLAIMING; - mdbx_trace("<< backlog %u, err %d", backlog_size(txn), err); + ctx->cursor.outer.mc_flags |= C_RECLAIMING; + mdbx_trace("<< backlog %u, err %d", gcu_backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } -static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { +static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { /* PNL is initially empty, zero out at least the length */ memset(pnl.iov_base, 0, sizeof(pgno_t)); if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) @@ -9179,61 +9212,54 @@ static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { * "checks and balances") to partially bypass the fundamental design problems * inherited from LMDB. So do not try to understand it completely in order to * avoid your madness. */ -static int mdbx_update_gc(MDBX_txn *txn) { +static int mdbx_update_gc(MDBX_txn *txn, gcu_context_t *ctx) { + mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); + MDBX_env *const env = txn->mt_env; + const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; + (void)dbg_prefix_mode; + ctx->cursor.outer.mc_flags |= C_RECLAIMING; + ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer; + /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. 
* txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. * Page numbers cannot disappear from txn->tw.retired_pages[]. */ - MDBX_env *const env = txn->mt_env; - const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; - const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; - (void)dbg_prefix_mode; - mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); - - unsigned retired_stored = 0, loop = 0; - MDBX_cursor_couple couple; - int rc = mdbx_cursor_init(&couple.outer, txn, FREE_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout_notracking; - - couple.outer.mc_flags |= C_RECLAIMING; - couple.outer.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &couple.outer; - bool dense_gc = false; retry: - ++loop; + ++ctx->loop; mdbx_trace("%s", " >> restart"); + int rc = MDBX_SUCCESS; mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { - mdbx_error("too more loops %u, bailout", loop); + if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 
12 : 42))) { + mdbx_error("too more loops %u, bailout", ctx->loop); rc = MDBX_PROBLEM; goto bailout; } - if (unlikely(dense_gc) && retired_stored) { - rc = mdbx_prep_backlog(txn, &couple.outer, - MDBX_PNL_SIZEOF(txn->tw.retired_pages), - &retired_stored); + if (unlikely(ctx->dense)) { + rc = gcu_clean_stored_retired(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, - filled_gc_slot = ~0u; - txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed; + ctx->settled = 0; + ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->filled_slot = ~0u; + ctx->cleaned_id = 0; + ctx->rid = txn->tw.last_reclaimed; while (true) { /* Come back here after each Put() in case retired-list changed */ MDBX_val key, data; mdbx_trace("%s", " >> continue"); - if (retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && - MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page) { - rc = mdbx_prep_backlog(txn, &couple.outer, - MDBX_PNL_SIZEOF(txn->tw.retired_pages), - &retired_stored); + if (ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && + (MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || + ctx->retired_stored > env->me_maxgc_ov1page)) { + rc = gcu_prepare_backlog(txn, ctx, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9241,48 +9267,48 @@ retry: mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (lifo) { - if (cleaned_gc_slot < (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)) { - settled = 0; - cleaned_gc_slot = 0; - reused_gc_slot = 0; - filled_gc_slot = ~0u; + if (ctx->lifo) { + if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)) { + ctx->settled = 0; + ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->filled_slot = ~0u; /* LY: cleanup reclaimed records. 
*/ do { - cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot]; - mdbx_tassert(txn, - cleaned_gc_slot > 0 && - cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); - key.iov_base = &cleaned_gc_id; - key.iov_len = sizeof(cleaned_gc_id); - rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_SET); + ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; + mdbx_tassert(txn, ctx->cleaned_slot > 0 && + ctx->cleaned_id < + env->me_lck->mti_oldest_reader.weak); + key.iov_base = &ctx->cleaned_id; + key.iov_len = sizeof(ctx->cleaned_id); + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET); if (rc == MDBX_NOTFOUND) continue; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - if (likely(!dense_gc)) { - rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); + if (likely(!ctx->dense)) { + rc = gcu_prepare_backlog(txn, ctx, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } mdbx_tassert(txn, - cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); + ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak); mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, - cleaned_gc_slot, cleaned_gc_id); - mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); - rc = mdbx_cursor_del(&couple.outer, 0); + ctx->cleaned_slot, ctx->cleaned_id); + mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer); + rc = mdbx_cursor_del(&ctx->cursor.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + } while (ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); mdbx_txl_sort(txn->tw.lifo_reclaimed); } } else { /* If using records from GC which we have not yet deleted, * now delete them and any we reserved for tw.reclaimed_pglist. 
*/ - while (cleaned_gc_id <= txn->tw.last_reclaimed) { - rc = mdbx_cursor_first(&couple.outer, &key, NULL); + while (ctx->cleaned_id <= txn->tw.last_reclaimed) { + rc = mdbx_cursor_first(&ctx->cursor.outer, &key, NULL); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND) break; @@ -9293,28 +9319,29 @@ retry: rc = MDBX_CORRUPTED; goto bailout; } - gc_rid = cleaned_gc_id; - settled = 0; - reused_gc_slot = 0; - cleaned_gc_id = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(cleaned_gc_id < MIN_TXNID || cleaned_gc_id > MAX_TXNID)) { + ctx->rid = ctx->cleaned_id; + ctx->settled = 0; + ctx->reused_slot = 0; + ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); + if (!MDBX_DISABLE_PAGECHECKS && unlikely(ctx->cleaned_id < MIN_TXNID || + ctx->cleaned_id > MAX_TXNID)) { rc = MDBX_CORRUPTED; goto bailout; } - if (cleaned_gc_id > txn->tw.last_reclaimed) + if (ctx->cleaned_id > txn->tw.last_reclaimed) break; - if (likely(!dense_gc)) { - rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); + if (likely(!ctx->dense)) { + rc = gcu_prepare_backlog(txn, ctx, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed); - mdbx_tassert(txn, cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); + mdbx_tassert(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); + mdbx_tassert(txn, + ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak); mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, - cleaned_gc_id); - mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); - rc = mdbx_cursor_del(&couple.outer, 0); + ctx->cleaned_id); + mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer); + rc = mdbx_cursor_del(&ctx->cursor.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9325,7 +9352,7 @@ retry: txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + 
rc = mdbx_audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9336,7 +9363,7 @@ retry: txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + rc = mdbx_audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9349,6 +9376,18 @@ retry: * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (txn->tw.loose_count > 0) { + mdbx_trace("%s: try allocate gc-slot for %u loose-pages", + dbg_prefix_mode, txn->tw.loose_count); + rc = + mdbx_page_alloc(&ctx->cursor.outer, 0, + MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) + .err; + if (rc == MDBX_SUCCESS) { + mdbx_trace("%s: retry since gc-slot for %u loose-pages available", + dbg_prefix_mode, txn->tw.loose_count); + continue; + } + /* Put loose page numbers in tw.retired_pages, * since unable to return them to tw.reclaimed_pglist. 
*/ if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, @@ -9416,47 +9455,47 @@ retry: const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); /* handle retired-list - store ones into single gc-record */ - if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { - if (unlikely(!retired_stored)) { + if (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { + if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ - couple.outer.mc_flags &= ~C_RECLAIMING; - rc = mdbx_page_search(&couple.outer, NULL, + ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + rc = mdbx_page_search(&ctx->cursor.outer, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); - couple.outer.mc_flags |= C_RECLAIMING; + ctx->cursor.outer.mc_flags |= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } + /* Write to last page of GC */ - key.iov_len = sizeof(txn->mt_txnid); + key.iov_len = sizeof(txnid_t); key.iov_base = &txn->mt_txnid; do { + gcu_prepare_backlog(txn, ctx, true); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - mdbx_prep_backlog(txn, &couple.outer, data.iov_len, &retired_stored); - rc = mdbx_cursor_put(&couple.outer, &key, &data, MDBX_RESERVE); + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Retry if tw.retired_pages[] grew during the Put() */ } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); + ctx->retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, - retired_stored, txn->mt_txnid); - + ctx->retired_stored, txn->mt_txnid); if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - 
unsigned i = retired_stored; - mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO - " num %u, PNL", + unsigned i = ctx->retired_stored; + mdbx_debug_extra("txn %" PRIaTXN " root %" PRIaPGNO + " num %u, retired-PNL", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); mdbx_debug_extra_print("%s\n", "."); } if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && - settled)) { + ctx->settled)) { mdbx_trace("%s: reclaimed-list changed %u -> %u, retry", dbg_prefix_mode, amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); @@ -9475,24 +9514,24 @@ retry: mdbx_trace("%s", " >> reserving"); if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + rc = mdbx_audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - const unsigned left = amount - settled; + const unsigned left = amount - ctx->settled; mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " "reused-gc-slots %u", - dbg_prefix_mode, amount, settled, (int)left, + dbg_prefix_mode, amount, ctx->settled, (int)left, txn->tw.lifo_reclaimed ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0, - reused_gc_slot); + ctx->reused_slot); if (0 >= (int)left) break; const unsigned prefer_max_scatter = 257; txnid_t reservation_gc_id; - if (lifo) { + if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { txn->tw.lifo_reclaimed = mdbx_txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { @@ -9503,18 +9542,18 @@ retry: if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + ctx->reused_slot) * env->me_maxgc_ov1page && - !dense_gc) { + !ctx->dense) { /* LY: need just a txn-id for save page list. 
*/ bool need_cleanup = false; txnid_t snap_oldest; retry_rid: - couple.outer.mc_flags &= ~C_RECLAIMING; + ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; do { snap_oldest = mdbx_find_oldest(txn); rc = - mdbx_page_alloc(&couple.outer, 0, + mdbx_page_alloc(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) .err; if (likely(rc == MDBX_SUCCESS)) { @@ -9526,9 +9565,9 @@ retry: (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + ctx->reused_slot) * env->me_maxgc_ov1page); - couple.outer.mc_flags |= C_RECLAIMING; + ctx->cursor.outer.mc_flags |= C_RECLAIMING; if (likely(rc == MDBX_SUCCESS)) { mdbx_trace("%s: got enough from GC.", dbg_prefix_mode); @@ -9540,9 +9579,9 @@ retry: if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { if (need_cleanup) { mdbx_txl_sort(txn->tw.lifo_reclaimed); - cleaned_gc_slot = 0; + ctx->cleaned_slot = 0; } - gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); + ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { mdbx_tassert(txn, txn->tw.last_reclaimed == 0); if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) @@ -9551,42 +9590,42 @@ retry: goto retry_rid; /* no reclaimable GC entries, * therefore no entries with ID < mdbx_find_oldest(txn) */ - txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; + txn->tw.last_reclaimed = ctx->rid = snap_oldest - 1; mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, - dbg_prefix_mode, gc_rid); + dbg_prefix_mode, ctx->rid); } /* LY: GC is empty, will look any free txn-id in high2low order. 
*/ while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + ctx->reused_slot) * env->me_maxgc_ov1page) { - if (unlikely(gc_rid <= MIN_TXNID)) { + if (unlikely(ctx->rid <= MIN_TXNID)) { if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= - reused_gc_slot)) { + ctx->reused_slot)) { mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " "lifo_reclaimed %u" PRIaTXN, - reused_gc_slot, + ctx->reused_slot, (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); goto retry; } break; } - mdbx_tassert(txn, gc_rid >= MIN_TXNID && gc_rid <= MAX_TXNID); - --gc_rid; - key.iov_base = &gc_rid; - key.iov_len = sizeof(gc_rid); - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + mdbx_tassert(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); + --ctx->rid; + key.iov_base = &ctx->rid; + key.iov_len = sizeof(ctx->rid); + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); if (unlikely(rc == MDBX_SUCCESS)) { mdbx_debug("%s: GC's id %" PRIaTXN " is used, continue bottom-up search", - dbg_prefix_mode, gc_rid); - ++gc_rid; - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST); + dbg_prefix_mode, ctx->rid); + ++ctx->rid; + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST); if (rc == MDBX_NOTFOUND) { mdbx_debug("%s: GC is empty (going dense-mode)", dbg_prefix_mode); - dense_gc = true; + ctx->dense = true; break; } if (unlikely(rc != MDBX_SUCCESS || @@ -9603,52 +9642,52 @@ retry: if (gc_first <= MIN_TXNID) { mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN " (going dense-mode)", - dbg_prefix_mode, gc_rid); - dense_gc = true; + dbg_prefix_mode, ctx->rid); + ctx->dense = true; break; } - gc_rid = gc_first - 1; + ctx->rid = gc_first - 1; } - mdbx_assert(env, !dense_gc); - rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid); + mdbx_assert(env, !ctx->dense); + rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, ctx->rid); if (unlikely(rc 
!= MDBX_SUCCESS)) goto bailout; - if (reused_gc_slot) + if (ctx->reused_slot) /* rare case, but it is better to clear and re-create GC entries * with less fragmentation. */ need_cleanup = true; else - cleaned_gc_slot += + ctx->cleaned_slot += 1 /* mark cleanup is not needed for added slot. */; mdbx_trace("%s: append @%" PRIaTXN " to lifo-reclaimed, cleaned-gc-slot = %u", - dbg_prefix_mode, gc_rid, cleaned_gc_slot); + dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); } - if (need_cleanup || dense_gc) { - if (cleaned_gc_slot) + if (need_cleanup || ctx->dense) { + if (ctx->cleaned_slot) mdbx_trace( "%s: restart inner-loop to clear and re-create GC entries", dbg_prefix_mode); - cleaned_gc_slot = 0; + ctx->cleaned_slot = 0; continue; } } const unsigned i = - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot; + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); reservation_gc_id = txn->tw.lifo_reclaimed[i]; mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", dbg_prefix_mode, reservation_gc_id, i); } else { mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); - if (unlikely(gc_rid == 0)) { - gc_rid = mdbx_find_oldest(txn) - 1; - rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_FIRST); + if (unlikely(ctx->rid == 0)) { + ctx->rid = mdbx_find_oldest(txn) - 1; + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); if (rc == MDBX_SUCCESS) { if (!MDBX_DISABLE_PAGECHECKS && unlikely(key.iov_len != sizeof(txnid_t))) { @@ -9661,32 +9700,32 @@ retry: rc = MDBX_CORRUPTED; goto bailout; } - if (gc_rid >= gc_first) - gc_rid = gc_first - 1; - if (unlikely(gc_rid == 0)) { + if (ctx->rid >= gc_first) + ctx->rid = gc_first - 1; + if (unlikely(ctx->rid == 0)) { mdbx_error("%s", "** no GC tail-space to store (going dense-mode)"); - dense_gc = true; + ctx->dense = true; goto retry; } } else if (rc != MDBX_NOTFOUND) goto bailout; - txn->tw.last_reclaimed = gc_rid; - 
cleaned_gc_id = gc_rid + 1; + txn->tw.last_reclaimed = ctx->rid; + ctx->cleaned_id = ctx->rid + 1; } - reservation_gc_id = gc_rid--; + reservation_gc_id = ctx->rid--; mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, reservation_gc_id); } - ++reused_gc_slot; + ++ctx->reused_slot; unsigned chunk = left; if (unlikely(chunk > env->me_maxgc_ov1page)) { const unsigned avail_gc_slots = txn->tw.lifo_reclaimed ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot + 1 - : (gc_rid < INT16_MAX) ? (unsigned)gc_rid - : INT16_MAX; + ctx->reused_slot + 1 + : (ctx->rid < INT16_MAX) ? (unsigned)ctx->rid + : INT16_MAX; if (avail_gc_slots > 1) { if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; @@ -9720,7 +9759,7 @@ retry: chunk = (avail >= tail) ? tail - span : (avail_gc_slots > 3 && - reused_gc_slot < prefer_max_scatter - 3) + ctx->reused_slot < prefer_max_scatter - 3) ? avail - span : tail; } @@ -9731,7 +9770,7 @@ retry: mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " "%" PRIaTXN, - dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id); + dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, env->me_maxgc_ov1page); @@ -9751,9 +9790,9 @@ retry: key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, - settled + 1, settled + chunk + 1, reservation_gc_id); - mdbx_prep_backlog(txn, &couple.outer, data.iov_len, nullptr); - rc = mdbx_cursor_put(&couple.outer, &key, &data, + ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); + gcu_prepare_backlog(txn, ctx, true); + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, @@ -9761,15 +9800,15 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - clean_reserved_gc_pnl(env, data); - 
settled += chunk; - mdbx_trace("%s: settled %u (+%u), continue", dbg_prefix_mode, settled, + gcu_clean_reserved(env, data); + ctx->settled += chunk; + mdbx_trace("%s: settled %u (+%u), continue", dbg_prefix_mode, ctx->settled, chunk); if (txn->tw.lifo_reclaimed && unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) && - (loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > - env->me_maxgc_ov1page)) { + (ctx->loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > + env->me_maxgc_ov1page)) { mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); goto retry; @@ -9780,15 +9819,15 @@ retry: mdbx_tassert( txn, - cleaned_gc_slot == + ctx->cleaned_slot == (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); mdbx_trace("%s", " >> filling"); /* Fill in the reserved records */ - filled_gc_slot = + ctx->filled_slot = txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot - : reused_gc_slot; + ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot + : ctx->reused_slot; rc = MDBX_SUCCESS; mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, @@ -9802,12 +9841,12 @@ retry: const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); unsigned left = amount; if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); - rc = mdbx_cursor_first(&couple.outer, &key, &data); + mdbx_tassert(txn, ctx->lifo == 0); + rc = mdbx_cursor_first(&ctx->cursor.outer, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { - mdbx_tassert(txn, lifo != 0); + mdbx_tassert(txn, ctx->lifo != 0); } while (true) { @@ -9815,35 +9854,35 @@ retry: mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); + mdbx_tassert(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); - if (filled_gc_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { + if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { mdbx_notice( "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN, - filled_gc_slot, fill_gc_id, txn->tw.last_reclaimed); + ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); goto retry; } } else { - mdbx_tassert(txn, lifo != 0); - if (++filled_gc_slot > + mdbx_tassert(txn, ctx->lifo != 0); + if (++ctx->filled_slot > (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " "lifo_reclaimed %u" PRIaTXN, - filled_gc_slot, + ctx->filled_slot, (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); goto retry; } - fill_gc_id = txn->tw.lifo_reclaimed[filled_gc_slot]; + fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; mdbx_trace("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", - dbg_prefix_mode, fill_gc_id, filled_gc_slot); + dbg_prefix_mode, fill_gc_id, ctx->filled_slot); 
key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, cleaned_gc_slot == + mdbx_tassert(txn, ctx->cleaned_slot == (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); @@ -9853,25 +9892,25 @@ retry: key.iov_len = sizeof(fill_gc_id); mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); - couple.outer.mc_flags |= C_GCFREEZE; + ctx->cursor.outer.mc_flags |= C_GCFREEZE; unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; if (unlikely(chunk > left)) { mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, left, fill_gc_id); - if ((loop < 5 && chunk - left > loop / 2) || + if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); - if (loop < 7) - couple.outer.mc_flags &= ~C_GCFREEZE; + if (ctx->loop < 7) + ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; } chunk = left; } - rc = mdbx_cursor_put(&couple.outer, &key, &data, + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_CURRENT | MDBX_RESERVE); - couple.outer.mc_flags &= ~C_GCFREEZE; + ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - clean_reserved_gc_pnl(env, data); + gcu_clean_reserved(env, data); if (unlikely(txn->tw.loose_count || amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { @@ -9881,16 +9920,18 @@ retry: goto retry; } if (unlikely(txn->tw.lifo_reclaimed - ? cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : cleaned_gc_id < txn->tw.last_reclaimed)) { + ? 
ctx->cleaned_slot < + MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : ctx->cleaned_id < txn->tw.last_reclaimed)) { mdbx_notice("%s", "** restart: reclaimed-slots changed"); goto retry; } - if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) { - mdbx_tassert(txn, - retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); + if (unlikely(ctx->retired_stored != + MDBX_PNL_SIZE(txn->tw.retired_pages))) { + mdbx_tassert(txn, ctx->retired_stored < + MDBX_PNL_SIZE(txn->tw.retired_pages)); mdbx_notice("** restart: retired-list growth (%u -> %u)", - retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); + ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); goto retry; } @@ -9907,7 +9948,7 @@ retry: left -= chunk; if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored + amount - left, true); + rc = mdbx_audit_ex(txn, ctx->retired_stored + amount - left, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9917,12 +9958,12 @@ retry: } if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); - rc = mdbx_cursor_next(&couple.outer, &key, &data, MDBX_NEXT); + mdbx_tassert(txn, ctx->lifo == 0); + rc = mdbx_cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { - mdbx_tassert(txn, lifo != 0); + mdbx_tassert(txn, ctx->lifo != 0); } } } @@ -9932,28 +9973,27 @@ retry: mdbx_notice("** restart: got %u loose pages", txn->tw.loose_count); goto retry; } - if (unlikely(filled_gc_slot != + if (unlikely(ctx->filled_slot != (txn->tw.lifo_reclaimed ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0))) { - const bool will_retry = loop < 9; + const bool will_retry = ctx->loop < 9; mdbx_notice("** %s: reserve excess (filled-slot %u, loop %u)", - will_retry ? "restart" : "ignore", filled_gc_slot, loop); + will_retry ? 
"restart" : "ignore", ctx->filled_slot, ctx->loop); if (will_retry) goto retry; } mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL || - cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + ctx->cleaned_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); bailout: - txn->mt_cursors[FREE_DBI] = couple.outer.mc_next; + txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; -bailout_notracking: MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; - mdbx_trace("<<< %u loops, rc = %d", loop, rc); + mdbx_trace("<<< %u loops, rc = %d", ctx->loop, rc); return rc; } @@ -10591,7 +10631,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } ts_1 = latency ? mdbx_osal_monotime() : 0; - rc = mdbx_update_gc(txn); + gcu_context_t gcu_ctx; + rc = gcu_context_init(txn, &gcu_ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = mdbx_update_gc(txn, &gcu_ctx); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -10613,11 +10657,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - struct mdbx_iov_ctx ctx; - mdbx_iov_init(txn, &ctx); - rc = mdbx_txn_write(txn, &ctx); + struct mdbx_iov_ctx write_ctx; + mdbx_iov_init(txn, &write_ctx); + rc = mdbx_txn_write(txn, &write_ctx); if (likely(rc == MDBX_SUCCESS)) - mdbx_iov_done(txn, &ctx); + mdbx_iov_done(txn, &write_ctx); /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ ts_3 = latency ? 
mdbx_osal_monotime() : 0; @@ -10636,7 +10680,6 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_canary = txn->mt_canary; meta_set_txnid(env, &meta, txn->mt_txnid); - rc = mdbx_sync_locked( env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); } diff --git a/src/internals.h b/src/internals.h index 0f7b6e54..ead89830 100644 --- a/src/internals.h +++ b/src/internals.h @@ -923,9 +923,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -986,11 +986,11 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; + unsigned spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; - unsigned spill_least_removed; } tw; }; }; From ece2fe2514e828096bef40b7c69701c1e744141a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 21 Jun 2022 19:48:49 +0300 Subject: [PATCH 003/364] mdbx: split `page_alloc()` and `page_new()` to fast- and slow/rare- parts. 
--- src/core.c | 233 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 140 insertions(+), 93 deletions(-) diff --git a/src/core.c b/src/core.c index 17a960e9..9fc92bba 100644 --- a/src/core.c +++ b/src/core.c @@ -3819,13 +3819,12 @@ struct page_result { int err; }; -static struct page_result mdbx_page_alloc(MDBX_cursor *mc, const pgno_t num, - int flags); static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, const txnid_t laggard); -static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, - const unsigned npages); +static struct page_result page_new(MDBX_cursor *mc, const unsigned flags); +static struct page_result page_new_large(MDBX_cursor *mc, + const unsigned npages); static int mdbx_page_touch(MDBX_cursor *mc); static int mdbx_cursor_touch(MDBX_cursor *mc); static int mdbx_touch_dbi(MDBX_cursor *mc); @@ -6508,8 +6507,8 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { #define MDBX_ALLOC_NOLOG 32 #define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -__hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc, - const pgno_t num, int flags) { +__cold static struct page_result +page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { struct page_result ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; @@ -6535,37 +6534,6 @@ __hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc, flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); } - if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) { - /* If there are any loose pages, just use them */ - mdbx_assert(env, (flags & MDBX_ALLOC_SLOT) == 0); - if (likely(txn->tw.loose_pages)) { -#if MDBX_ENABLE_REFUND - if (txn->tw.loose_refund_wl > txn->mt_next_pgno) { - mdbx_refund(txn); - if (unlikely(!txn->tw.loose_pages)) - goto no_loose; - } -#endif /* MDBX_ENABLE_REFUND */ - - ret.page = txn->tw.loose_pages; - txn->tw.loose_pages = ret.page->mp_next; - txn->tw.loose_count--; - 
mdbx_debug_extra("db %d use loose page %" PRIaPGNO, DDBI(mc), - ret.page->mp_pgno); - mdbx_tassert(txn, ret.page->mp_pgno < txn->mt_next_pgno); - mdbx_ensure(env, ret.page->mp_pgno >= NUM_METAS); - VALGRIND_MAKE_MEM_UNDEFINED(page_data(ret.page), page_space(txn->mt_env)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(ret.page), - page_space(txn->mt_env)); - ret.page->mp_txnid = txn->mt_front; - ret.err = MDBX_SUCCESS; - return ret; - } - } -#if MDBX_ENABLE_REFUND -no_loose: -#endif /* MDBX_ENABLE_REFUND */ - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -6968,7 +6936,6 @@ done: if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); - /* LY: reset no-access flag from mdbx_page_loose() */ VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); } else { @@ -7022,6 +6989,76 @@ done: return ret; } +__hot static struct page_result page_alloc(MDBX_cursor *mc) { + MDBX_txn *const txn = mc->mc_txn; + + /* If there are any loose pages, just use them */ + while (likely(txn->tw.loose_pages)) { +#if MDBX_ENABLE_REFUND + if (unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) { + mdbx_refund(txn); + if (!txn->tw.loose_pages) + break; + } +#endif /* MDBX_ENABLE_REFUND */ + + MDBX_page *page = txn->tw.loose_pages; + txn->tw.loose_pages = page->mp_next; + txn->tw.loose_count--; + mdbx_debug_extra("db %d use loose page %" PRIaPGNO, DDBI(mc), + page->mp_pgno); + mdbx_tassert(txn, page->mp_pgno < txn->mt_next_pgno); + mdbx_tassert(txn, page->mp_pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env)); + MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env)); + page->mp_txnid = txn->mt_front; + struct page_result ret = {page, MDBX_SUCCESS}; + return ret; + } + + if (likely(!(mc->mc_flags & C_GCFREEZE))) { + MDBX_PNL pnl = txn->tw.reclaimed_pglist; + const unsigned len = 
MDBX_PNL_SIZE(pnl); + if (likely(len > 0)) { + MDBX_PNL_SIZE(pnl) = len - 1; +#if MDBX_PNL_ASCENDING + const pgno_t pgno = pnl[1]; + for (unsigned i = 1; i < len; ++i) + pnl[i] = pnl[i + 1]; +#else + const pgno_t pgno = pnl[len]; +#endif + + MDBX_env *const env = txn->mt_env; + struct page_result ret; + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); + } else { + ret.page = mdbx_page_malloc(txn, 1); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + return ret; + } + } + + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize); + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + mdbx_tassert(txn, ret.page->mp_pgno >= NUM_METAS); + + ret.err = mdbx_page_dirty(txn, ret.page, 1); + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + return ret; + } + } + + return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); +} + /* Copy the used portions of a non-overflow page. */ __hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, size_t psize) { @@ -7131,7 +7168,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { rc = mdbx_pnl_need(&txn->tw.retired_pages, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + const struct page_result par = page_alloc(mc); rc = par.err; np = par.page; if (unlikely(rc != MDBX_SUCCESS)) @@ -9126,9 +9163,9 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { return err; } -/* LY: Prepare a backlog of pages to modify GC itself, - * while reclaiming is prohibited. It should be enough to prevent search - * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */ +/* Prepare a backlog of pages to modify GC itself, while reclaiming is + * prohibited. 
It should be enough to prevent search in page_alloc_slowpath() + * during a deleting, when GC tree is unbalanced. */ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, const bool reserve4retired) { const unsigned pages4retiredlist = @@ -9171,8 +9208,8 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, err = gcu_clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_page_alloc(&ctx->cursor.outer, pages4retiredlist, - MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) + err = page_alloc_slowpath(&ctx->cursor.outer, pages4retiredlist, + MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) .err; mdbx_trace("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), err); @@ -9183,9 +9220,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && err == MDBX_SUCCESS) - err = mdbx_page_alloc(&ctx->cursor.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE | - MDBX_ALLOC_NOLOG) + err = page_alloc_slowpath(&ctx->cursor.outer, 0, + MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | + MDBX_ALLOC_FAKE | MDBX_ALLOC_NOLOG) .err; ctx->cursor.outer.mc_flags |= C_RECLAIMING; @@ -9378,10 +9415,10 @@ retry: if (txn->tw.loose_count > 0) { mdbx_trace("%s: try allocate gc-slot for %u loose-pages", dbg_prefix_mode, txn->tw.loose_count); - rc = - mdbx_page_alloc(&ctx->cursor.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) - .err; + rc = page_alloc_slowpath(&ctx->cursor.outer, 0, + MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | + MDBX_ALLOC_FAKE) + .err; if (rc == MDBX_SUCCESS) { mdbx_trace("%s: retry since gc-slot for %u loose-pages available", dbg_prefix_mode, txn->tw.loose_count); @@ -9552,10 +9589,10 @@ retry: ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; do { snap_oldest = mdbx_find_oldest(txn); - rc = - mdbx_page_alloc(&ctx->cursor.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) - .err; + rc = page_alloc_slowpath(&ctx->cursor.outer, 0, + MDBX_ALLOC_GC | 
MDBX_ALLOC_SLOT | + MDBX_ALLOC_FAKE) + .err; if (likely(rc == MDBX_SUCCESS)) { mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); @@ -9585,7 +9622,7 @@ retry: } else { mdbx_tassert(txn, txn->tw.last_reclaimed == 0); if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) - /* should retry mdbx_page_alloc(MDBX_ALLOC_GC) + /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) * if the oldest reader changes since the last attempt */ goto retry_rid; /* no reclaimable GC entries, @@ -15655,7 +15692,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(err != MDBX_SUCCESS)) return err; } - struct page_result npr = mdbx_page_new(mc, P_LEAF, 1); + struct page_result npr = page_new(mc, P_LEAF); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; npr.err = mdbx_cursor_push(mc, npr.page); @@ -15945,7 +15982,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, nested_dupdb.md_entries = page_numkeys(fp); xdata.iov_len = sizeof(nested_dupdb); xdata.iov_base = &nested_dupdb; - const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + const struct page_result par = page_alloc(mc); mp = par.page; if (unlikely(par.err != MDBX_SUCCESS)) return par.err; @@ -16301,49 +16338,59 @@ fail: } /* Allocate and initialize new pages for a database. - * Set MDBX_TXN_ERROR on failure. - * - * [in] mc a cursor on the database being added to. - * [in] flags flags defining what type of page is being allocated. - * [in] num the number of pages to allocate. This is usually 1, - * unless allocating overflow pages for a large record. - * [out] mp Address of a page, or NULL on failure. - * - * Returns 0 on success, non-zero on failure. */ -static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, - const unsigned npages) { - struct page_result ret = mdbx_page_alloc(mc, npages, MDBX_ALLOC_ALL); + * Set MDBX_TXN_ERROR on failure. 
*/ +static struct page_result page_new(MDBX_cursor *mc, const unsigned flags) { + mdbx_cassert(mc, (flags & P_OVERFLOW) == 0); + struct page_result ret = page_alloc(mc); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi, - ret.page->mp_pgno, npages); + mdbx_debug("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, + ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; ret.page->mp_txnid = mc->mc_txn->mt_front; mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); +#if MDBX_ENABLE_PGOP_STAT + mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + + STATIC_ASSERT(P_BRANCH == 1); + const unsigned is_branch = flags & P_BRANCH; + + ret.page->mp_lower = 0; + ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); + mc->mc_db->md_branch_pages += is_branch; + mc->mc_db->md_leaf_pages += 1 - is_branch; + if (unlikely(mc->mc_flags & C_SUB)) { + MDBX_db *outer = mdbx_outer_db(mc); + outer->md_branch_pages += is_branch; + outer->md_leaf_pages += 1 - is_branch; + } + return ret; +} + +static struct page_result page_new_large(MDBX_cursor *mc, + const unsigned npages) { + struct page_result ret = + likely(npages == 1) ? 
page_alloc(mc) + : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + + mdbx_debug("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, + ret.page->mp_pgno, npages); + ret.page->mp_flags = P_OVERFLOW; + ret.page->mp_txnid = mc->mc_txn->mt_front; + mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); + mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (likely((flags & P_OVERFLOW) == 0)) { - STATIC_ASSERT(P_BRANCH == 1); - const bool is_branch = flags & P_BRANCH; - ret.page->mp_lower = 0; - ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); - mc->mc_db->md_branch_pages += is_branch; - mc->mc_db->md_leaf_pages += 1 - is_branch; - if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - outer->md_branch_pages += is_branch; - outer->md_leaf_pages += 1 - is_branch; - } - } else { - mc->mc_db->md_overflow_pages += npages; - ret.page->mp_pages = npages; - mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); - } - + mc->mc_db->md_overflow_pages += npages; + ret.page->mp_pages = npages; + mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); return ret; } @@ -16464,7 +16511,7 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, return MDBX_PROBLEM; } const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); - const struct page_result npr = mdbx_page_new(mc, P_OVERFLOW, ovpages); + const struct page_result npr = page_new_large(mc, ovpages); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; largepage = npr.page; @@ -18599,7 +18646,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mdbx_cassert(mc, nkeys + 1 >= minkeys * 2); /* Create a new sibling page. 
*/ - struct page_result npr = mdbx_page_new(mc, mp->mp_flags, 1); + struct page_result npr = page_new(mc, mp->mp_flags); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; MDBX_page *const sister = npr.page; @@ -18611,7 +18658,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, * the cursor height may be greater because it walks * up the stack while finding the branch slot to update. */ if (mc->mc_top < 1) { - npr = mdbx_page_new(mc, P_BRANCH, 1); + npr = page_new(mc, P_BRANCH); rc = npr.err; if (unlikely(rc != MDBX_SUCCESS)) goto done; From e3a09db3da1111d5d7469676a0c302ce0d4c1625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 22 Jun 2022 18:33:00 +0300 Subject: [PATCH 004/364] mdbx: always coalescing GC records, regardless to `MDBX_COALESCE` flag. --- mdbx.h | 5 ++-- src/core.c | 76 +++++++++++++++++++++++-------------------------- src/internals.h | 2 ++ 3 files changed, 41 insertions(+), 42 deletions(-) diff --git a/mdbx.h b/mdbx.h index 12540c23..8ca3d625 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1087,8 +1087,8 @@ enum MDBX_env_flags_t { * while opening the database/environment which is already used by another * process(es) with unknown mode/flags. In such cases, if there is a * difference in the specified flags (\ref MDBX_NOMETASYNC, - * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref MDBX_LIFORECLAIM, - * \ref MDBX_COALESCE and \ref MDBX_NORDAHEAD), instead of returning an error, + * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref MDBX_LIFORECLAIM + * and \ref MDBX_NORDAHEAD), instead of returning an error, * the database will be opened in a compatibility with the already used mode. * * `MDBX_ACCEDE` has no effect if the current process is the only one either @@ -1195,6 +1195,7 @@ enum MDBX_env_flags_t { MDBX_NOMEMINIT = UINT32_C(0x1000000), /** Aims to coalesce a Garbage Collection items. 
+ * \note Always enabled since v0.12 * * With `MDBX_COALESCE` flag MDBX will aims to coalesce items while recycling * a Garbage Collection. Technically, when possible short lists of pages diff --git a/src/core.c b/src/core.c index 9fc92bba..b9a6131e 100644 --- a/src/core.c +++ b/src/core.c @@ -6499,13 +6499,13 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { * * Returns 0 on success, non-zero on failure.*/ -#define MDBX_ALLOC_CACHE 1 -#define MDBX_ALLOC_GC 2 -#define MDBX_ALLOC_NEW 4 +#define MDBX_ALLOC_GC 1 +#define MDBX_ALLOC_NEW 2 +#define MDBX_ALLOC_COALESCE 4 #define MDBX_ALLOC_SLOT 8 #define MDBX_ALLOC_FAKE 16 #define MDBX_ALLOC_NOLOG 32 -#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW) +#define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) __cold static struct page_result page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { @@ -6518,9 +6518,9 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { const unsigned coalesce_threshold = env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4; if (likely(flags & MDBX_ALLOC_GC)) { - flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > coalesce_threshold) - flags &= ~MDBX_COALESCE; + flags |= env->me_flags & MDBX_LIFORECLAIM; + if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) + flags |= MDBX_ALLOC_COALESCE; if (unlikely( /* If mc is updating the GC, then the retired-list cannot play catch-up with itself by growing while trying to save it. 
*/ @@ -6531,7 +6531,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* If our dirty list is already full, we can't touch GC */ (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); + flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); } mdbx_tassert(txn, @@ -6551,7 +6551,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { * Prefer pages with lower pgno. */ mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - if (!(flags & (MDBX_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { + if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; @@ -6714,7 +6714,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { "(chunk) -> %u", MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); + flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); break; } ret.err = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); @@ -6769,20 +6769,21 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { } /* Don't try to coalesce too much. 
*/ - if (flags & MDBX_COALESCE) { + if (flags & MDBX_ALLOC_COALESCE) { if (re_len /* current size */ > coalesce_threshold || (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >= coalesce_threshold / 2)) { - mdbx_trace("clear %s %s", "MDBX_COALESCE", "since got threshold"); - flags &= ~MDBX_COALESCE; + mdbx_trace("clear %s %s", "MDBX_ALLOC_COALESCE", + "since got threshold"); + flags &= ~MDBX_ALLOC_COALESCE; } } } - if (F_ISSET(flags, MDBX_COALESCE | MDBX_ALLOC_GC)) { - mdbx_debug_extra("clear %s and continue", "MDBX_COALESCE"); - flags &= ~MDBX_COALESCE; + if (F_ISSET(flags, MDBX_ALLOC_COALESCE | MDBX_ALLOC_GC)) { + mdbx_debug_extra("clear %s and continue", "MDBX_ALLOC_COALESCE"); + flags &= ~MDBX_ALLOC_COALESCE; continue; } @@ -13228,29 +13229,21 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (unlikely(flags & ~ENV_USABLE_FLAGS)) return MDBX_EINVAL; - if (flags & MDBX_RDONLY) - mode = 0; - - if (env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map) + if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) return MDBX_EPERM; - /* pickup previously mdbx_env_set_flags(), + /* Pickup previously mdbx_env_set_flags(), * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ const uint32_t saved_me_flags = env->me_flags; - flags = merge_sync_flags(flags, env->me_flags); - - MDBX_handle_env_pathname env_pathname; - rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); if (flags & MDBX_RDONLY) { - /* LY: silently ignore irrelevant flags when - * we're only getting read access */ + /* Silently ignore irrelevant flags when we're only getting read access */ flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | - MDBX_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMETASYNC 
| MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE); + mode = 0; } else { #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. @@ -13262,13 +13255,17 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " "of an internal flaw(s) in a file/buffer/page cache.\n"); - rc = 42 /* ENOPROTOOPT */; - goto bailout; + return 42 /* ENOPROTOOPT */; } } #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ } + MDBX_handle_env_pathname env_pathname; + rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); @@ -13337,8 +13334,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, const MDBX_env_flags_t rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | - MDBX_LIFORECLAIM | MDBX_COALESCE | - MDBX_NORDAHEAD; + MDBX_LIFORECLAIM | + MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { @@ -13349,11 +13346,11 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, * - let's assume that for some reason the DB file is smaller * than it should be according to the geometry, * but not smaller than the last page used; - * - the first process that opens the database (lc_rc = true) + * - the first process that opens the database (lck_rc == RESULT_TRUE) * does this in readonly mode and therefore cannot bring * the file size back to normal; - * - some next process (lc_rc = false) opens the DB in read-write - * mode and now is here. 
+ * - some next process (lck_rc != RESULT_TRUE) opens the DB in + * read-write mode and now is here. * * FIXME: Should we re-check and set the size of DB-file right here? */ break; @@ -13362,8 +13359,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, } if (env->me_flags & MDBX_ACCEDE) { - /* pickup current mode-flags, including MDBX_LIFORECLAIM | - * MDBX_COALESCE | MDBX_NORDAHEAD */ + /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */ const unsigned diff = (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, diff --git a/src/internals.h b/src/internals.h index ead89830..21a88711 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1086,6 +1086,8 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; mdbx_mmap_t me_dxb_mmap; /* The main data file */ From acce7d4b16cf7f4cc69422f5d6532dd205c4506b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 23 Jun 2022 14:19:46 +0300 Subject: [PATCH 005/364] mdbx-test: remove obsolete `coalesce` option.
--- test/config.cc | 1 - test/long_stochastic.sh | 2 +- test/main.cc | 1 - test/stochastic_small.sh | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test/config.cc b/test/config.cc index 61b299b4..38063892 100644 --- a/test/config.cc +++ b/test/config.cc @@ -369,7 +369,6 @@ const struct option_verb mode_bits[] = { {"notls", unsigned(MDBX_NOTLS)}, {"nordahead", unsigned(MDBX_NORDAHEAD)}, {"nomeminit", unsigned(MDBX_NOMEMINIT)}, - {"coalesce", unsigned(MDBX_COALESCE)}, {"lifo", unsigned(MDBX_LIFORECLAIM)}, {"perturb", unsigned(MDBX_PAGEPERTURB)}, {"accede", unsigned(MDBX_ACCEDE)}, diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 04b9976e..16023f73 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -284,7 +284,7 @@ else fi syncmodes=("" ,+nosync-safe ,+nosync-utterly) -options=(writemap coalesce lifo notls perturb) +options=(writemap lifo notls perturb) function join { local IFS="$1"; shift; echo "$*"; } diff --git a/test/main.cc b/test/main.cc index b4b8022b..88d47799 100644 --- a/test/main.cc +++ b/test/main.cc @@ -98,7 +98,6 @@ MDBX_NORETURN void usage(void) { " accede == MDBX_ACCEDE\n" " nometasync == MDBX_NOMETASYNC\n" " lifo == MDBX_LIFORECLAIM\n" - " coalesce == MDBX_COALESCE\n" " nosync-safe == MDBX_SAFE_NOSYNC\n" " writemap == MDBX_WRITEMAP\n" " nosync-utterly == MDBX_UTTERLY_NOSYNC\n" diff --git a/test/stochastic_small.sh b/test/stochastic_small.sh index 8c9bba5f..5e216ced 100755 --- a/test/stochastic_small.sh +++ b/test/stochastic_small.sh @@ -263,7 +263,7 @@ else fi syncmodes=("" ,+nosync-safe ,+nosync-utterly) -options=(writemap coalesce lifo notls perturb) +options=(writemap lifo notls perturb) function join { local IFS="$1"; shift; echo "$*"; } From 065e5849dad8639412b1e020881632c8e39887d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 24 Jun 2022 22:05:41 +0300 Subject: [PATCH 
006/364] mdbx: speedup GC-related pnl-merge and sequence-search. --- src/core.c | 145 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 52 deletions(-) diff --git a/src/core.c b/src/core.c index b9a6131e..a81c600a 100644 --- a/src/core.c +++ b/src/core.c @@ -3203,18 +3203,22 @@ static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) { assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); assert(mdbx_pnl_check(src, MAX_PAGENO + 1)); - const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src); - assert(MDBX_PNL_ALLOCLEN(dst) >= total); - pgno_t *w = dst + total; - pgno_t *d = dst + MDBX_PNL_SIZE(dst); - const pgno_t *s = src + MDBX_PNL_SIZE(src); - dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0); - while (s > src) { - while (MDBX_PNL_ORDERED(*s, *d)) - *w-- = *d--; - *w-- = *s--; + if (likely(MDBX_PNL_SIZE(src) > 0)) { + const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src); + assert(MDBX_PNL_ALLOCLEN(dst) >= total); + pgno_t *w = dst + total; + pgno_t *d = dst + MDBX_PNL_SIZE(dst); + const pgno_t *s = src + MDBX_PNL_SIZE(src); + dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0); + do { + const bool cmp = MDBX_PNL_ORDERED(*s, *d); + *w = cmp ? *d : *s; + d -= cmp ? 1 : 0; + s -= cmp ? 
0 : 1; + --w; + } while (s > src); + MDBX_PNL_SIZE(dst) = (pgno_t)total; } - MDBX_PNL_SIZE(dst) = (pgno_t)total; assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); } @@ -6483,6 +6487,63 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { return MDBX_SUCCESS; } +__hot static pgno_t *scan4range(const MDBX_PNL pnl, const unsigned len, + const int num) { + assert(num > 0 && len >= (unsigned)num && len == MDBX_PNL_SIZE(pnl)); +#if MDBX_PNL_ASCENDING + const pgno_t *const detent = pnl + len - num; + pgno_t *scan = pnl + 1; + while (likely(scan + 7 <= detent)) { + if (unlikely(scan[num] == *scan + num)) + return scan; + if (unlikely(scan[num + 1] == scan[1] + num)) + return scan + 1; + if (unlikely(scan[num + 2] == scan[2] + num)) + return scan + 2; + if (unlikely(scan[num + 3] == scan[3] + num)) + return scan + 3; + if (unlikely(scan[num + 4] == scan[4] + num)) + return scan + 4; + if (unlikely(scan[num + 5] == scan[5] + num)) + return scan + 5; + if (unlikely(scan[num + 6] == scan[6] + num)) + return scan + 6; + if (unlikely(scan[num + 7] == scan[7] + num)) + return scan + 7; + scan += 8; + } + for (; scan <= detent; ++scan) + if (scan[num] == *scan + num) + return scan; +#else + const pgno_t *const detent = pnl + num; + pgno_t *scan = pnl + len; + while (likely(scan - 7 >= detent)) { + if (unlikely(scan[-num] == *scan + num)) + return scan; + if (unlikely(scan[-num - 1] == scan[-1] + num)) + return scan - 1; + if (unlikely(scan[-num - 2] == scan[-2] + num)) + return scan - 2; + if (unlikely(scan[-num - 3] == scan[-3] + num)) + return scan - 3; + if (unlikely(scan[-num - 4] == scan[-4] + num)) + return scan - 4; + if (unlikely(scan[-num - 5] == scan[-5] + num)) + return scan - 5; + if (unlikely(scan[-num - 6] == scan[-6] + num)) + return scan - 6; + if (unlikely(scan[-num - 7] == scan[-7] + num)) + return scan - 7; + scan -= 8; + } + for (; scan >= detent; --scan) + if (scan[-num] == *scan + num) + return scan; +#endif /* MDBX_PNL 
sort-order */ + return nullptr; +} + /* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. * @@ -6534,11 +6595,12 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); } - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_assert(env, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; - unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); + unsigned re_len = MDBX_PNL_SIZE(re_list); + pgno_t *range = nullptr; txnid_t oldest = 0, last = 0; while (true) { /* hsr-kick retry loop */ @@ -6549,37 +6611,16 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Seek a big enough contiguous page range. * Prefer pages with lower pgno. */ - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { - mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && - MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); - range_begin = MDBX_PNL_ASCENDING ? 
1 : re_len; - pgno = MDBX_PNL_LEAST(re_list); - if (likely(num == 1)) + mdbx_assert(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && + MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); + range = scan4range(re_list, re_len, num); + if (likely(range)) { + pgno = *range; goto done; - - const unsigned wanna_range = num - 1; -#if MDBX_PNL_ASCENDING - mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); - while (true) { - unsigned range_end = range_begin + wanna_range; - if (re_list[range_end] - pgno == wanna_range) - goto done; - if (range_end == re_len) - break; - pgno = re_list[++range_begin]; } -#else - mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); - while (true) { - if (re_list[range_begin - wanna_range] - pgno == wanna_range) - goto done; - if (range_begin == wanna_range) - break; - pgno = re_list[--range_begin]; - } -#endif /* MDBX_PNL sort-order */ } if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ @@ -6795,7 +6836,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { * - extend the database file. */ /* Will use new pages from the map if nothing is suitable in the GC. 
*/ - range_begin = 0; + range = nullptr; pgno = txn->mt_next_pgno; const size_t next = (size_t)pgno + num; @@ -6947,20 +6988,20 @@ done: } } - if (range_begin) { + if (range) { mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); mdbx_tassert(txn, pgno < txn->mt_next_pgno); - mdbx_tassert(txn, pgno == re_list[range_begin]); + mdbx_tassert(txn, pgno == *range); /* Cutoff allocated pages from tw.reclaimed_pglist */ #if MDBX_PNL_ASCENDING - for (unsigned i = range_begin + num; i <= re_len;) - re_list[range_begin++] = re_list[i++]; - MDBX_PNL_SIZE(re_list) = re_len = range_begin - 1; + for (const pgno_t *const end = re_list + re_len - num; range <= end; + ++range) + *range = range[num]; #else - MDBX_PNL_SIZE(re_list) = re_len -= num; - for (unsigned i = range_begin - num; i < re_len;) - re_list[++i] = re_list[++range_begin]; + for (const pgno_t *const end = re_list + re_len; ++range <= end;) + range[-(ptrdiff_t)num] = *range; #endif + MDBX_PNL_SIZE(re_list) = re_len -= num; mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); From 23bbceb367a575d0c0322c88ec4e3f9b6a5d1b61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 27 Jun 2022 13:53:22 +0300 Subject: [PATCH 007/364] mdbx: minor fix `EINVAL` from `mdbx_env_set_geometry()`. Silently grow `size_lower` up to `MIN_PAGENO` pages instead of returning `MDBX_EINVAL`.
--- src/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index a81c600a..351ce980 100644 --- a/src/core.c +++ b/src/core.c @@ -11918,8 +11918,13 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } if ((uint64_t)size_lower / pagesize < MIN_PAGENO) { - rc = MDBX_EINVAL; - goto bailout; + size_lower = pagesize * MIN_PAGENO; + if (unlikely(size_lower > size_upper)) { + rc = MDBX_EINVAL; + goto bailout; + } + if (size_now < size_lower) + size_now = size_lower; } if (unlikely((size_t)size_upper > MAX_MAPSIZE || From 48c60514828b751510fc76eb4954ab82986239c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 29 Jun 2022 13:35:07 +0300 Subject: [PATCH 008/364] mdbx: minor fix `meta_checktxnid()` to avoid assertion in debug mode. --- src/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index 351ce980..e150c878 100644 --- a/src/core.c +++ b/src/core.c @@ -7803,7 +7803,7 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { /* check against todo4recovery://erased_by_github/libmdbx/issues/269 */ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, bool report) { - const txnid_t meta_txnid = constmeta_txnid(env, meta); + const txnid_t head_txnid = meta_txnid(env, meta); const txnid_t freedb_mod_txnid = meta->mm_dbs[FREE_DBI].md_mod_txnid; const txnid_t maindb_mod_txnid = meta->mm_dbs[MAIN_DBI].md_mod_txnid; @@ -7820,25 +7820,25 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, const uint64_t magic_and_version = unaligned_peek_u64(4, &meta->mm_magic_and_version); bool ok = true; - if (unlikely(meta_txnid < freedb_mod_txnid || + if (unlikely(!head_txnid || head_txnid < freedb_mod_txnid || (!freedb_mod_txnid && freedb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if 
(report) mdbx_warning( "catch invalid %sdb_mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", - "free", freedb_mod_txnid, meta_txnid, + "free", freedb_mod_txnid, head_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } - if (unlikely(meta_txnid < maindb_mod_txnid || + if (unlikely(head_txnid < maindb_mod_txnid || (!maindb_mod_txnid && maindb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) mdbx_warning( "catch invalid %sdb_mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", - "main", maindb_mod_txnid, meta_txnid, + "main", maindb_mod_txnid, head_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } @@ -7879,7 +7879,7 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, * for todo4recovery://erased_by_github/libmdbx/issues/269 */ static int meta_waittxnid(const MDBX_env *env, const MDBX_meta *meta, uint64_t *timestamp) { - if (likely(meta_checktxnid(env, (const MDBX_meta *)meta, !*timestamp))) + if (likely(meta_checktxnid(env, meta, !*timestamp))) return MDBX_SUCCESS; if (!*timestamp) From 4f6b92248d8ed2adf19e173b6e4824186a57ebe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 7 Jun 2022 21:20:35 +0300 Subject: [PATCH 009/364] mdbx: add `pgop_stat.gcrtime` for collect the time spent loading and searching inside GC. --- mdbx.h | 3 +++ src/core.c | 23 +++++++++++++++++++++++ src/internals.h | 3 +++ 3 files changed, 29 insertions(+) diff --git a/mdbx.h b/mdbx.h index 80eb7325..d67525cb 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2491,6 +2491,9 @@ struct MDBX_envinfo { uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ uint64_t wops; /**< Number of explicit write operations (not a pages) to a disk */ + uint64_t + gcrtime_seconds16dot16; /**< Time spent loading and searching inside + GC (aka FreeDB) in 1/65536 of second. 
*/ } mi_pgop_stat; }; #ifndef __cplusplus diff --git a/src/core.c b/src/core.c index e150c878..efcc160c 100644 --- a/src/core.c +++ b/src/core.c @@ -6602,6 +6602,9 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { unsigned re_len = MDBX_PNL_SIZE(re_list); pgno_t *range = nullptr; txnid_t oldest = 0, last = 0; +#if MDBX_ENABLE_PGOP_STAT + uint64_t timestamp = 0; +#endif /* MDBX_ENABLE_PGOP_STAT */ while (true) { /* hsr-kick retry loop */ MDBX_cursor_couple recur; @@ -6632,6 +6635,10 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { ? mdbx_find_oldest(txn) : atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease); +#if MDBX_ENABLE_PGOP_STAT + if (likely(timestamp == 0)) + timestamp = mdbx_osal_monotime(); +#endif /* MDBX_ENABLE_PGOP_STAT */ ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; @@ -6804,6 +6811,11 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Done for a kick-reclaim mode, actually no page needed */ if (unlikely(flags & MDBX_ALLOC_SLOT)) { mdbx_debug("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); +#if MDBX_ENABLE_PGOP_STAT + mdbx_assert(env, timestamp != 0); + env->me_lck->mti_pgop_stat.gcrtime.weak += + mdbx_osal_monotime() - timestamp; +#endif /* MDBX_ENABLE_PGOP_STAT */ ret.err = MDBX_SUCCESS; ret.page = NULL; return ret; @@ -6942,6 +6954,11 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { } fail: +#if MDBX_ENABLE_PGOP_STAT + if (timestamp) + env->me_lck->mti_pgop_stat.gcrtime.weak += + mdbx_osal_monotime() - timestamp; +#endif /* MDBX_ENABLE_PGOP_STAT */ mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -6968,6 +6985,10 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { done: mdbx_assert(env, !(flags & MDBX_ALLOC_SLOT)); mdbx_ensure(env, pgno >= NUM_METAS); +#if MDBX_ENABLE_PGOP_STAT + if 
(likely(timestamp)) + env->me_lck->mti_pgop_stat.gcrtime.weak += mdbx_osal_monotime() - timestamp; +#endif /* MDBX_ENABLE_PGOP_STAT */ if (unlikely(flags & MDBX_ALLOC_FAKE)) { mdbx_debug("return NULL-page for %u pages %s allocation", num, "gc-slot/backlog"); @@ -20410,6 +20431,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); + arg->mi_pgop_stat.gcrtime_seconds16dot16 = mdbx_osal_monotime_to_16dot16( + atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed)); #else memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); #endif /* MDBX_ENABLE_PGOP_STAT*/ diff --git a/src/internals.h b/src/internals.h index 21a88711..309d3113 100644 --- a/src/internals.h +++ b/src/internals.h @@ -561,6 +561,9 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ + MDBX_atomic_uint64_t + gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The + unit/scale is platform-depended, see mdbx_osal_monotime(). */ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ From 720b4d56be84ec6f2528ac16091a956bdcc59ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 2 Jul 2022 09:05:59 +0300 Subject: [PATCH 010/364] mdbx: bigfoot feature. Chunking long list of retired pages during huge transactions commit to avoid use sequences of pages: - splits a long retired page-number-list into chunks which fits one per single overflow/large page; - this requires a few unique id for keys for create such records into GC/freeDB; - just use the necessary subsequent IDs following the current transaction ID and then take the last of ones to update a meta-page. 
Thus avoids using/allocating/searching a sequence of free pages but just increase txnid more than one during the commit a huge write transaction with a long retired-pages-list. --- src/core.c | 257 +++++++++++++++++++++++++++++++++----------------- src/options.h | 12 +++ 2 files changed, 183 insertions(+), 86 deletions(-) diff --git a/src/core.c b/src/core.c index efcc160c..1af2703d 100644 --- a/src/core.c +++ b/src/core.c @@ -5828,7 +5828,7 @@ static int meta_eq_mask(const MDBX_env *env) { return rc; } -static __inline volatile const MDBX_meta * +static __always_inline volatile const MDBX_meta * meta_recent(const enum meta_choise_mode mode, const MDBX_env *env, volatile const MDBX_meta *a, volatile const MDBX_meta *b) { const bool a_older_that_b = meta_ot(mode, env, a, b); @@ -5844,7 +5844,7 @@ static const MDBX_meta *meta_ancient_prefer_weak(const MDBX_env *env, return a_older_that_b ? a : b; } -static __inline volatile const MDBX_meta * +static __always_inline volatile const MDBX_meta * meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { volatile const MDBX_meta *m0 = METAPAGE(env, 0); volatile const MDBX_meta *m1 = METAPAGE(env, 1); @@ -5907,21 +5907,19 @@ static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. 
*/ -static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { - mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - MDBX_env *env = txn->mt_env; +static txnid_t find_oldest_reader(const MDBX_env *env) { const txnid_t edge = mdbx_recent_steady_txnid(env); - mdbx_tassert(txn, edge <= txn->mt_txnid); + mdbx_assert(env, edge <= env->me_txn0->mt_txnid); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (unlikely(lck == NULL /* exclusive mode */)) { + if (unlikely(lck == NULL /* exclusive without-lck mode */)) { mdbx_assert(env, env->me_lck == (void *)&env->x_lckless_stub); return env->me_lck->mti_oldest_reader.weak = edge; } const txnid_t last_oldest = atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); - mdbx_tassert(txn, edge >= last_oldest); + mdbx_assert(env, edge >= last_oldest); if (likely(last_oldest == edge)) return edge; @@ -5932,15 +5930,15 @@ static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { if (snap_readers_refresh_flag == nothing_changed) return last_oldest; - txnid_t oldest = edge; atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed); const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + txnid_t oldest = edge; for (unsigned i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { /* mdbx_jitter4testing(true); */ const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); - if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) { + if (oldest > snap && /* ignore pending updates */ snap <= edge) { oldest = snap; if (oldest == last_oldest) return oldest; @@ -5951,20 +5949,21 @@ static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { if (oldest != last_oldest) { mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest); - mdbx_tassert(txn, oldest >= lck->mti_oldest_reader.weak); + mdbx_assert(env, oldest >= lck->mti_oldest_reader.weak); atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); } 
return oldest; } /* Find largest mvcc-snapshot still referenced. */ -__cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { +__cold static pgno_t find_largest_snapshot(const MDBX_env *env, + pgno_t last_used_page) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck != NULL /* exclusive mode */)) { + if (likely(lck != NULL /* check for exclusive without-lck mode */)) { + retry:; const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { - retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { /* mdbx_jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( @@ -5976,16 +5975,13 @@ __cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { mo_AcquireRelease) || snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) goto retry; - if (largest < snap_pages && - atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= - /* ignore pending updates */ snap_txnid && - snap_txnid <= env->me_txn0->mt_txnid) - largest = snap_pages; + if (last_used_page < snap_pages && snap_txnid <= env->me_txn0->mt_txnid) + last_used_page = snap_pages; } } } - return largest; + return last_used_page; } /* Add a page to the txn's dirty list */ @@ -6601,7 +6597,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; unsigned re_len = MDBX_PNL_SIZE(re_list); pgno_t *range = nullptr; - txnid_t oldest = 0, last = 0; + txnid_t detent = 0, last = 0; #if MDBX_ENABLE_PGOP_STAT uint64_t timestamp = 0; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -6630,22 +6626,20 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (unlikely(!(flags & MDBX_ALLOC_GC))) break /* reclaiming is prohibited for now */; - /* Prepare to fetch more and coalesce */ - oldest = (flags & MDBX_LIFORECLAIM) - ? 
mdbx_find_oldest(txn) - : atomic_load64(&env->me_lck->mti_oldest_reader, - mo_AcquireRelease); + /* Prepare to fetch and coalesce */ #if MDBX_ENABLE_PGOP_STAT if (likely(timestamp == 0)) timestamp = mdbx_osal_monotime(); #endif /* MDBX_ENABLE_PGOP_STAT */ + detent = find_oldest_reader(env) + 1; + ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; if (flags & MDBX_LIFORECLAIM) { /* Begin from oldest reader if any */ - if (oldest > MIN_TXNID) { - last = oldest - 1; + if (detent > MIN_TXNID) { + last = detent - 1; op = MDBX_SET_RANGE; } } else if (txn->tw.last_reclaimed) { @@ -6660,9 +6654,9 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (!(flags & MDBX_LIFORECLAIM)) { /* Do not try fetch more if the record will be too recent */ - if (op != MDBX_FIRST && ++last >= oldest) { - oldest = mdbx_find_oldest(txn); - if (oldest <= last) + if (op != MDBX_FIRST && ++last >= detent) { + detent = find_oldest_reader(env) + 1; + if (detent <= last) break; } } @@ -6671,10 +6665,10 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { if (op == MDBX_SET_RANGE) continue; - txnid_t snap = mdbx_find_oldest(txn); - if (oldest < snap) { - oldest = snap; - last = oldest - 1; + const txnid_t snap = find_oldest_reader(env); + if (unlikely(detent <= snap)) { + detent = snap + 1; + last = snap; key.iov_base = &last; key.iov_len = sizeof(last); op = MDBX_SET_RANGE; @@ -6698,9 +6692,9 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { ret.err = MDBX_CORRUPTED; goto fail; } - if (oldest <= last) { - oldest = mdbx_find_oldest(txn); - if (oldest <= last) { + if (detent <= last) { + detent = find_oldest_reader(env) + 1; + if (detent <= last) { if (flags & MDBX_LIFORECLAIM) continue; break; @@ -6857,12 +6851,12 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { const MDBX_meta *const steady = 
constmeta_prefer_steady(env); /* does reclaiming stopped at the last steady point? */ if (head != steady && META_IS_STEADY(steady) && - oldest == constmeta_txnid(env, steady)) { + detent == constmeta_txnid(env, steady) + 1) { mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN "-%s, oldest %" PRIaTXN, constmeta_txnid(env, head), mdbx_durable_str(head), constmeta_txnid(env, steady), mdbx_durable_str(steady), - oldest); + detent); ret.err = MDBX_RESULT_TRUE; const pgno_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); @@ -6881,7 +6875,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { next >= steady->mm_geo.now)) { /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode * without any auto-sync threshold(s). */ - ret.err = mdbx_wipe_steady(env, oldest); + ret.err = mdbx_wipe_steady(env, detent); mdbx_debug("gc-wipe-steady, rc %d", ret.err); mdbx_assert(env, steady != meta_prefer_steady(env)); } else if ((flags & MDBX_ALLOC_NEW) == 0 || @@ -6902,16 +6896,11 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { mdbx_debug("gc-make-steady, rc %d", ret.err); mdbx_assert(env, steady != meta_prefer_steady(env)); } - if (ret.err == MDBX_SUCCESS) { - if (mdbx_find_oldest(txn) > oldest) - continue; - /* it is reasonable check/kick lagging reader(s) here, - * since we made a new steady point or wipe the last. */ - if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP && - mdbx_kick_longlived_readers(env, oldest) > oldest) - continue; - } else if (unlikely(ret.err != MDBX_RESULT_TRUE)) - goto fail; + if (likely(ret.err != MDBX_RESULT_TRUE)) { + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + continue; + } } } @@ -6919,9 +6908,14 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { * at the end of database file. 
*/ if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno) goto done; - if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - xMDBX_TXNID_STEP && - mdbx_kick_longlived_readers(env, oldest) > oldest) - continue; + + if (flags & MDBX_ALLOC_GC) { + const txnid_t laggard = find_oldest_reader(env); + if (laggard >= detent || + (laggard < txn->mt_txnid - xMDBX_TXNID_STEP && + mdbx_kick_longlived_readers(env, laggard) >= detent)) + continue; + } ret.err = MDBX_NOTFOUND; if (flags & MDBX_ALLOC_NEW) { @@ -7420,7 +7414,7 @@ retry:; env->me_txn0->mt_txnid = head_txnid; mdbx_assert(env, head_txnid == meta_txnid(env, head)); mdbx_assert(env, head_txnid == mdbx_recent_committed_txnid(env)); - mdbx_find_oldest(env->me_txn0); + find_oldest_reader(env); flags |= MDBX_SHRINK_ALLOWED; } @@ -8042,8 +8036,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { snap == meta_txnid(env, meta) && snap >= atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease))) { - /* workaround for todo4recovery://erased_by_github/libmdbx/issues/269 - */ rc = meta_waittxnid(env, (const MDBX_meta *)meta, &timestamp); mdbx_jitter4testing(false); if (likely(rc == MDBX_SUCCESS)) @@ -8633,7 +8625,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); if (snap_nreaders) { - oldest_snapshot = mdbx_find_oldest(txn); + oldest_snapshot = find_oldest_reader(env); if (oldest_snapshot == txn->mt_txnid - 1) { /* check if there is at least one reader */ bool exists = false; @@ -9195,12 +9187,18 @@ typedef struct gc_update_context { unsigned settled, cleaned_slot, reused_slot, filled_slot; txnid_t cleaned_id, rid; bool lifo, dense; +#if MDBX_ENABLE_BIGFOOT + txnid_t bigfoot; +#endif /* MDBX_ENABLE_BIGFOOT */ MDBX_cursor_couple cursor; } gcu_context_t; static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { memset(ctx, 0, offsetof(gcu_context_t, cursor)); ctx->lifo =
(txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0; +#if MDBX_ENABLE_BIGFOOT + ctx->bigfoot = txn->mt_txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ return mdbx_cursor_init(&ctx->cursor.outer, txn, FREE_DBI); } @@ -9210,19 +9208,29 @@ static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) { static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { int err = MDBX_SUCCESS; - if (ctx->retired_stored) { - MDBX_val key, val; - key.iov_base = &txn->mt_txnid; - key.iov_len = sizeof(txnid_t); - const struct cursor_set_result csr = - mdbx_cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); - if (csr.err == MDBX_SUCCESS && csr.exact) { - ctx->retired_stored = 0; - err = mdbx_cursor_del(&ctx->cursor.outer, 0); - mdbx_trace("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn), - err); + if (ctx->retired_stored) + do { + MDBX_val key, val; +#if MDBX_ENABLE_BIGFOOT + key.iov_base = &ctx->bigfoot; +#else + key.iov_base = &txn->mt_txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ + key.iov_len = sizeof(txnid_t); + const struct cursor_set_result csr = + mdbx_cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); + if (csr.err == MDBX_SUCCESS && csr.exact) { + ctx->retired_stored = 0; + err = mdbx_cursor_del(&ctx->cursor.outer, 0); + mdbx_trace("== clear-4linear, backlog %u, err %d", + gcu_backlog_size(txn), err); + } } - } +#if MDBX_ENABLE_BIGFOOT + while (!err && --ctx->bigfoot >= txn->mt_txnid); +#else + while (0); +#endif /* MDBX_ENABLE_BIGFOOT */ return err; } @@ -9379,7 +9387,7 @@ retry: do { ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; mdbx_tassert(txn, ctx->cleaned_slot > 0 && - ctx->cleaned_id < + ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &ctx->cleaned_id; key.iov_len = sizeof(ctx->cleaned_id); @@ -9394,7 +9402,7 @@ retry: goto bailout; } mdbx_tassert(txn, - ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak); + ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); mdbx_trace("%s: cleanup-reclaimed-id 
[%u]%" PRIaTXN, dbg_prefix_mode, ctx->cleaned_slot, ctx->cleaned_id); mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer); @@ -9437,7 +9445,7 @@ retry: } mdbx_tassert(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); mdbx_tassert(txn, - ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak); + ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, ctx->cleaned_id); mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer); @@ -9566,6 +9574,63 @@ retry: goto bailout; } +#if MDBX_ENABLE_BIGFOOT + unsigned retired_pages_before; + do { + if (ctx->bigfoot > txn->mt_txnid) { + rc = gcu_clean_stored_retired(txn, ctx); + mdbx_tassert(txn, ctx->bigfoot <= txn->mt_txnid); + } + + retired_pages_before = MDBX_PNL_SIZE(txn->tw.retired_pages); + rc = gcu_prepare_backlog(txn, ctx, true); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); + ctx->retired_stored = 0; + ctx->bigfoot = txn->mt_txnid; + do { + key.iov_len = sizeof(txnid_t); + key.iov_base = &ctx->bigfoot; + const unsigned left = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages) - + ctx->retired_stored; + const unsigned chunk = + (left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID) + ? env->me_maxgc_ov1page + : left; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + if (retired_pages_before == MDBX_PNL_SIZE(txn->tw.retired_pages)) { + const unsigned at = (ctx->lifo == MDBX_PNL_ASCENDING) + ? left - chunk + : ctx->retired_stored; + pgno_t *const begin = txn->tw.retired_pages + at; + /* MDBX_PNL_ASCENDING == false && LIFO == false: + * - the larger pgno is at the beginning of retired list + * and should be placed with the larger txnid. 
+ * MDBX_PNL_ASCENDING == true && LIFO == true: + * - the larger pgno is at the ending of retired list + * and should be placed with the smaller txnid. + */ + const pgno_t save = *begin; + *begin = chunk; + memcpy(data.iov_base, begin, data.iov_len); + *begin = save; + mdbx_trace("%s: put-retired/bigfoot @ %" PRIaTXN + " (slice #%u) #%u [%u..%u] of %u", + dbg_prefix_mode, ctx->bigfoot, + (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, + at + chunk, retired_pages_before); + } + ctx->retired_stored += chunk; + } while (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages) && + (++ctx->bigfoot, true)); + } while (retired_pages_before != MDBX_PNL_SIZE(txn->tw.retired_pages)); +#else /* Write to last page of GC */ key.iov_len = sizeof(txnid_t); key.iov_base = &txn->mt_txnid; @@ -9585,6 +9650,7 @@ retry: mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, ctx->retired_stored, txn->mt_txnid); +#endif /* MDBX_ENABLE_BIGFOOT */ if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { unsigned i = ctx->retired_stored; mdbx_debug_extra("txn %" PRIaTXN " root %" PRIaPGNO @@ -9651,7 +9717,7 @@ retry: retry_rid: ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; do { - snap_oldest = mdbx_find_oldest(txn); + snap_oldest = find_oldest_reader(env); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) @@ -9684,13 +9750,13 @@ retry: ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { mdbx_tassert(txn, txn->tw.last_reclaimed == 0); - if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) + if (unlikely(find_oldest_reader(env) != snap_oldest)) /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) * if the oldest reader changes since the last attempt */ goto retry_rid; /* no reclaimable GC entries, * therefore no entries with ID < mdbx_find_oldest(txn) */ - txn->tw.last_reclaimed = ctx->rid = snap_oldest - 1; + txn->tw.last_reclaimed = ctx->rid = snap_oldest; mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, 
ctx->rid); } @@ -9786,7 +9852,7 @@ retry: } else { mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); if (unlikely(ctx->rid == 0)) { - ctx->rid = mdbx_find_oldest(txn) - 1; + ctx->rid = find_oldest_reader(env); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); if (rc == MDBX_SUCCESS) { if (!MDBX_DISABLE_PAGECHECKS && @@ -9875,10 +9941,10 @@ retry: mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, env->me_maxgc_ov1page); - mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak); + mdbx_tassert(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); if (unlikely( reservation_gc_id < MIN_TXNID || - reservation_gc_id >= + reservation_gc_id > atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")", reservation_gc_id); @@ -9987,7 +10053,7 @@ retry: ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); mdbx_tassert(txn, fill_gc_id > 0 && - fill_gc_id < env->me_lck->mti_oldest_reader.weak); + fill_gc_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); @@ -10779,7 +10845,17 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_canary = txn->mt_canary; - meta_set_txnid(env, &meta, txn->mt_txnid); + + txnid_t commit_txnid = txn->mt_txnid; +#if MDBX_ENABLE_BIGFOOT + if (gcu_ctx.bigfoot > txn->mt_txnid) { + commit_txnid = gcu_ctx.bigfoot; + mdbx_trace("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, + (unsigned)(commit_txnid - txn->mt_txnid)); + } +#endif + meta_set_txnid(env, &meta, commit_txnid); + rc = mdbx_sync_locked( env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); } @@ -11276,7 +11352,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, pgno_t shrink = 0; if (flags & MDBX_SHRINK_ALLOWED) { /* LY: check conditions to discard unused 
pages */ - const pgno_t largest_pgno = mdbx_find_largest( + const pgno_t largest_pgno = find_largest_snapshot( env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next : pending->mm_geo.next); mdbx_assert(env, largest_pgno >= NUM_METAS); @@ -11566,7 +11642,15 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (rc != MDBX_SUCCESS) goto undo; } - mdbx_assert(env, meta_checktxnid(env, target, true)); + } + + uint64_t timestamp = 0; + while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { + rc = meta_waittxnid(env, target, &timestamp); + if (likely(rc == MDBX_SUCCESS)) + break; + if (unlikely(rc != MDBX_RESULT_TRUE)) + goto fail; } env->me_lck->mti_meta_sync_txnid.weak = (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a) - @@ -11823,7 +11907,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, const MDBX_meta *head = constmeta_prefer_last(env); if (!inside_txn) { env->me_txn0->mt_txnid = constmeta_txnid(env, head); - mdbx_find_oldest(env->me_txn0); + find_oldest_reader(env); } /* get untouched params from DB */ @@ -11845,7 +11929,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, goto bailout; } const size_t usedbytes = - pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next)); + pgno2bytes(env, find_largest_snapshot(env, head->mm_geo.next)); if ((size_t)size_upper < usedbytes) { rc = MDBX_MAP_FULL; goto bailout; @@ -21405,7 +21489,7 @@ __cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, /* LY: notify end of hsr-loop */ env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry); } - return mdbx_find_oldest(env->me_txn); + return find_oldest_reader(env); } #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API @@ -23319,6 +23403,7 @@ __dll_export #else #error "FIXME: Unsupported byte order" #endif /* __BYTE_ORDER__ */ + " MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT) " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG " MDBX_TXN_CHECKOWNER=" 
MDBX_TXN_CHECKOWNER_CONFIG " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG diff --git a/src/options.h b/src/options.h index 30ffdf1b..bf4b71f3 100644 --- a/src/options.h +++ b/src/options.h @@ -80,6 +80,18 @@ #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. */ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + /** Controls use of POSIX madvise() hints and friends. */ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 From b9835389f448147ca45dd144c0e5a9067b2a287a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 2 Jul 2022 18:35:13 +0300 Subject: [PATCH 011/364] mdbx: add cache for pointers to last/steady meta-pages (off by default). --- src/core.c | 57 +++++++++++++++++++++++++++++++++++++++++++------ src/internals.h | 4 ++++ src/options.h | 6 ++++++ 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index 1af2703d..580106c6 100644 --- a/src/core.c +++ b/src/core.c @@ -5720,6 +5720,15 @@ constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) { return (a == b) ? 
a : 0; } +static __inline void meta_cache_clear(MDBX_env *env) { +#if MDBX_CACHE_METAS + env->cache_last_meta = nullptr; + env->cache_steady_meta = nullptr; +#else + (void)env; +#endif /* MDBX_CACHE_METAS */ +} + static __inline txnid_t meta_txnid(const MDBX_env *env, volatile const MDBX_meta *meta) { (void)env; @@ -5856,28 +5865,52 @@ meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { } static volatile const MDBX_meta *meta_prefer_steady(const MDBX_env *env) { - return meta_mostrecent(prefer_steady, env); + return +#if MDBX_CACHE_METAS + ((MDBX_env *)env)->cache_steady_meta = +#endif /* MDBX_CACHE_METAS */ + meta_mostrecent(prefer_steady, env); } MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * constmeta_prefer_steady(const MDBX_env *env) { - return (const MDBX_meta *)meta_mostrecent(prefer_steady, env); +#if MDBX_CACHE_METAS + mdbx_assert(env, !env->cache_steady_meta || + env->cache_steady_meta == + meta_mostrecent(prefer_steady, env)); + return (const MDBX_meta *)(env->cache_steady_meta ? env->cache_steady_meta : +#else + return (const MDBX_meta *)( +#endif /* MDBX_CACHE_METAS */ + meta_prefer_steady(env)); } static volatile const MDBX_meta *meta_prefer_last(const MDBX_env *env) { - return meta_mostrecent(prefer_last, env); + return +#if MDBX_CACHE_METAS + ((MDBX_env *)env)->cache_last_meta = +#endif /* MDBX_CACHE_METAS */ + meta_mostrecent(prefer_last, env); } MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * constmeta_prefer_last(const MDBX_env *env) { - return (const MDBX_meta *)meta_mostrecent(prefer_last, env); +#if MDBX_CACHE_METAS + mdbx_assert(env, + !env->cache_last_meta || + env->cache_last_meta == meta_mostrecent(prefer_last, env)); + return (const MDBX_meta *)(env->cache_last_meta ? 
env->cache_last_meta : +#else + return (const MDBX_meta *)( +#endif /* MDBX_CACHE_METAS */ + meta_prefer_last(env)); } static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { while (true) { volatile const MDBX_meta *head = meta_prefer_last(env); const txnid_t recent = meta_txnid(env, head); - mdbx_compiler_barrier(); + mdbx_memory_barrier(); if (likely(head == meta_prefer_last(env) && recent == meta_txnid(env, head))) return recent; @@ -6328,6 +6361,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, } #endif /* MDBX_ENABLE_MADVISE */ + meta_cache_clear(env); rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); #if MDBX_ENABLE_MADVISE @@ -6480,6 +6514,7 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { /* force oldest refresh */ atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); + meta_cache_clear(env); return MDBX_SUCCESS; } @@ -7409,6 +7444,7 @@ retry:; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += wops; #endif /* MDBX_ENABLE_PGOP_STAT */ + meta_cache_clear(env); goto retry; } env->me_txn0->mt_txnid = head_txnid; @@ -7653,6 +7689,7 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { /* no write-txn */ last = NUM_METAS; should_unlock = true; + meta_cache_clear(env); } else { /* write txn is running, therefore shouldn't poison any memory range */ return; @@ -7994,6 +8031,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { uint64_t timestamp = 0; while (1) { + meta_cache_clear(env); volatile const MDBX_meta *const meta = meta_prefer_last(env); mdbx_jitter4testing(false); const txnid_t snap = meta_txnid(env, meta); @@ -8120,6 +8158,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } #endif /* Windows */ + meta_cache_clear(env); mdbx_jitter4testing(false); const MDBX_meta *meta = constmeta_prefer_last(env); uint64_t 
timestamp = 0; @@ -11644,6 +11683,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } } + meta_cache_clear(env); uint64_t timestamp = 0; while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { rc = meta_waittxnid(env, target, &timestamp); @@ -11903,6 +11943,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (unlikely(err != MDBX_SUCCESS)) return err; need_unlock = true; + meta_cache_clear(env); } const MDBX_meta *head = constmeta_prefer_last(env); if (!inside_txn) { @@ -13127,6 +13168,7 @@ __cold static int __must_check_result mdbx_override_meta( } mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); + meta_cache_clear(env); return rc; } @@ -13600,7 +13642,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, #if MDBX_DEBUG if (rc == MDBX_SUCCESS) { - const MDBX_meta *meta = constmeta_prefer_last(env); + const MDBX_meta *meta = (const MDBX_meta *)meta_prefer_last(env); const MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; mdbx_debug("opened database version %u, pagesize %u", @@ -20136,6 +20178,7 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, if (unlikely(rc)) return rc; should_unlock = true; + meta_cache_clear(env); } if (onoff) @@ -22968,6 +23011,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; + meta_cache_clear(env); } env->me_options.dp_reserve_limit = (unsigned)value; while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { @@ -23004,6 +23048,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; + meta_cache_clear(env); } if (env->me_txn) err = MDBX_EPERM /* unable change during transaction */; diff --git a/src/internals.h b/src/internals.h index 309d3113..cda28c2b 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1169,6 
+1169,10 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ mdbx_fastmutex_t me_dbi_lock; +#if MDBX_CACHE_METAS + volatile const MDBX_meta *cache_last_meta; + volatile const MDBX_meta *cache_steady_meta; +#endif /* MDBX_CACHE_METAS */ MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ diff --git a/src/options.h b/src/options.h index bf4b71f3..283eec9e 100644 --- a/src/options.h +++ b/src/options.h @@ -92,6 +92,12 @@ #error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 #endif /* MDBX_ENABLE_BIGFOOT */ +#ifndef MDBX_CACHE_METAS +#define MDBX_CACHE_METAS 0 +#elif !(MDBX_CACHE_METAS == 0 || MDBX_CACHE_METAS == 1) +#error MDBX_CACHE_METAS must be defined as 0 or 1 +#endif /* MDBX_CACHE_METAS */ + /** Controls use of POSIX madvise() hints and friends. */ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 From d61c0963138577d5097f4e5287d5ceb906ae9d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 2 Jul 2022 19:28:55 +0300 Subject: [PATCH 012/364] mdbx: drop `mdbx_recent_steady_txnid()` and fix extra search for steady meta-page. 
--- src/core.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/src/core.c b/src/core.c index 580106c6..ba66033d 100644 --- a/src/core.c +++ b/src/core.c @@ -5917,17 +5917,6 @@ static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { } } -static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { - while (true) { - volatile const MDBX_meta *head = meta_prefer_steady(env); - const txnid_t recent = meta_txnid(env, head); - mdbx_compiler_barrier(); - if (likely(head == meta_prefer_steady(env) && - recent == meta_txnid(env, head))) - return recent; - } -} - static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { if (META_IS_STEADY(meta)) return (unaligned_peek_u64_volatile(4, meta->mm_datasync_sign) == @@ -5941,20 +5930,21 @@ static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { /* Find oldest txnid still referenced. */ static txnid_t find_oldest_reader(const MDBX_env *env) { - const txnid_t edge = mdbx_recent_steady_txnid(env); - mdbx_assert(env, edge <= env->me_txn0->mt_txnid); + const txnid_t steady_edge = + constmeta_txnid(env, constmeta_prefer_steady(env)); + mdbx_assert(env, steady_edge <= env->me_txn0->mt_txnid); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL /* exclusive without-lck mode */)) { mdbx_assert(env, env->me_lck == (void *)&env->x_lckless_stub); - return env->me_lck->mti_oldest_reader.weak = edge; + return env->me_lck->mti_oldest_reader.weak = steady_edge; } const txnid_t last_oldest = atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); - mdbx_assert(env, edge >= last_oldest); - if (likely(last_oldest == edge)) - return edge; + mdbx_assert(env, steady_edge >= last_oldest); + if (likely(last_oldest == steady_edge)) + return steady_edge; const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); const uint32_t snap_readers_refresh_flag = @@ -5966,12 +5956,12 @@ static txnid_t find_oldest_reader(const MDBX_env 
*env) { atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed); const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - txnid_t oldest = edge; + txnid_t oldest = steady_edge; for (unsigned i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { /* mdbx_jitter4testing(true); */ const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); - if (oldest > snap && /* ignore pending updates */ snap <= edge) { + if (oldest > snap && /* ignore pending updates */ snap <= steady_edge) { oldest = snap; if (oldest == last_oldest) return oldest; @@ -21442,7 +21432,7 @@ __cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, int retry; for (retry = 0; retry < INT_MAX; ++retry) { - txnid_t oldest = mdbx_recent_steady_txnid(env); + txnid_t oldest = constmeta_txnid(env, constmeta_prefer_steady(env)); mdbx_assert(env, oldest < env->me_txn0->mt_txnid); mdbx_assert(env, oldest >= laggard); mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak); From 6c5ff863ff8cda40339d23f211814e319186b7a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 30 Jun 2022 13:42:10 +0300 Subject: [PATCH 013/364] mdbx: remove `pp_txnid4chk()`, preparing to rework of page checking/validation. 
--- src/core.c | 128 +++++++++++++++++++----------------------------- src/internals.h | 3 +- 2 files changed, 52 insertions(+), 79 deletions(-) diff --git a/src/core.c b/src/core.c index e150c878..557f266c 100644 --- a/src/core.c +++ b/src/core.c @@ -3856,11 +3856,9 @@ enum { static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode); __hot static struct page_result __must_check_result -mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, txnid_t front); -static __always_inline int __must_check_result mdbx_page_get(MDBX_cursor *mc, - pgno_t pgno, - MDBX_page **mp, - txnid_t front) { +mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front); +static __always_inline int __must_check_result mdbx_page_get( + MDBX_cursor *mc, const pgno_t pgno, MDBX_page **mp, const txnid_t front) { struct page_result ret = mdbx_page_get_ex(mc, pgno, front); *mp = ret.page; @@ -4798,16 +4796,6 @@ static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, mdbx_dpage_free(txn->mt_env, mp, npages); } -static __inline txnid_t pp_txnid4chk(const MDBX_page *mp, const MDBX_txn *txn) { - (void)txn; -#if MDBX_DISABLE_PAGECHECKS - (void)mp; - return 0; -#else - return /* maybe zero in legacy DB */ mp->mp_txnid; -#endif /* !MDBX_DISABLE_PAGECHECKS */ -} - /* Retire, loosen or free a single page. 
* * For dirty pages, saves single pages to a list for future reuse in this same @@ -6717,7 +6705,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (unlikely((ret.err = mdbx_node_read( &recur.outer, page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), - &data, pp_txnid4chk(mp, txn))) != MDBX_SUCCESS)) + &data, mp->mp_txnid)) != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { @@ -13923,9 +13911,9 @@ static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { return MDBX_SUCCESS; } -__hot static struct page_result -mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, - /* TODO: use parent-page ptr */ txnid_t front) { +__hot static struct page_result mdbx_page_get_ex(MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front) { struct page_result ret; MDBX_txn *const txn = mc->mc_txn; mdbx_tassert(txn, front <= txn->mt_front); @@ -14058,8 +14046,8 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, mdbx_cassert(mc, i >= 0 && i < (int)page_numkeys(mp)); node = page_node(mp, i); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) + rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = (indx_t)i; @@ -14155,8 +14143,7 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { return MDBX_INCOMPATIBLE; /* not a named DB */ } - const txnid_t pp_txnid = - pp_txnid4chk(couple.outer.mc_pg[couple.outer.mc_top], txn); + const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; rc = mdbx_node_read(&couple.outer, nsr.node, &data, pp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14208,10 +14195,9 @@ __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; mdbx_cassert(mc, IS_BRANCH(mp)); MDBX_node *node = page_node(mp, 0); - int rc; - if (unlikely((rc = mdbx_page_get(mc, 
node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) + int rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = 0; @@ -14460,8 +14446,8 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) { + rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); return rc; @@ -14472,7 +14458,7 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { return rc; mc->mc_ki[mc->mc_top] = - (indx_t)((dir == SIBLING_LEFT) ? page_numkeys(mp) - 1 : 0); + (dir == SIBLING_LEFT) ? (indx_t)page_numkeys(mp) - 1 : 0; return MDBX_SUCCESS; } @@ -14528,8 +14514,8 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(ki >= numkeys)) { mdbx_debug("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != - MDBX_SUCCESS)) { + rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + if (unlikely(rc != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; } @@ -14567,9 +14553,8 @@ skip: if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read(mc, node, data, - pp_txnid4chk(mp, mc->mc_txn))) != - MDBX_SUCCESS)) + rc = mdbx_node_read(mc, node, data, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14662,9 +14647,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read(mc, node, data, - pp_txnid4chk(mp, mc->mc_txn))) != - 
MDBX_SUCCESS)) + rc = mdbx_node_read(mc, node, data, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14927,7 +14911,7 @@ got_node: } MDBX_val actual_data; ret.err = mdbx_node_read(mc, node, &actual_data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data); @@ -14942,8 +14926,7 @@ got_node: } *data = actual_data; } else { - ret.err = mdbx_node_read(mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + ret.err = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } @@ -15000,10 +14983,8 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (unlikely(rc)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) + rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -15052,10 +15033,8 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (unlikely(rc)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) + rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -15121,7 +15100,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } } else { - rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn)); + rc = mdbx_node_read(mc, node, data, mp->mp_txnid); if (unlikely(rc)) return rc; } @@ -15228,8 +15207,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(node_flags(node), 
F_DUPDATA)) { get_key_optional(node, key); - rc = mdbx_node_read(mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); break; } } @@ -15393,7 +15371,7 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return MDBX_NOTFOUND; } - const txnid_t pp_txnid = pp_txnid4chk(page, mc->mc_txn); + const txnid_t pp_txnid = page->mp_txnid; do { if (unlikely(n + 2 > limit)) { rc = MDBX_RESULT_TRUE; @@ -15837,8 +15815,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, : 0; const pgno_t pgno = node_largedata_pgno(node); - struct page_result pgr = mdbx_page_get_ex( - mc, pgno, pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + struct page_result pgr = + mdbx_page_get_ex(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(pgr.err != MDBX_SUCCESS)) return pgr.err; if (unlikely(!IS_OVERFLOW(pgr.page))) @@ -16366,7 +16344,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (F_ISSET(node_flags(node), F_BIGDATA)) { MDBX_page *omp; if (unlikely((rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, - pp_txnid4chk(mp, mc->mc_txn))) || + mp->mp_txnid)) || (rc = mdbx_page_retire(mc, omp)))) goto fail; } @@ -17942,8 +17920,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { } else if (IS_BRANCH(mp) && nkeys == 1) { mdbx_debug("%s", "collapsing root page!"); mc->mc_db->md_root = node_pgno(page_node(mp, 0)); - rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], - pp_txnid4chk(mp, mc->mc_txn)); + rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_db->md_depth--; @@ -18004,7 +17981,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { if (mn.mc_ki[pre_top] > 0) { rc = mdbx_page_get( &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), - &left, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + &left, mc->mc_pg[mc->mc_top]->mp_txnid); if 
(unlikely(rc != MDBX_SUCCESS)) return rc; mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top])); @@ -18012,7 +17989,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { rc = mdbx_page_get( &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), - &right, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + &right, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top])); @@ -18244,20 +18221,19 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); } if ((options & C_RETIRING) == 0) { - MDBX_page *lp; - int err = mdbx_page_get(mc, node_largedata_pgno(node), &lp, - pp_txnid4chk(mp, mc->mc_txn)); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (unlikely(!IS_OVERFLOW(lp))) { + const struct page_result lp = + mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + if (unlikely(!IS_OVERFLOW(lp.page))) { rc = bad_page(mp, "big-node refs to non-overflow page (%u)\n", - lp->mp_pgno); + lp.page->mp_pgno); continue; } - if (unlikely(number_of_ovpages(env, dsize) > lp->mp_pages)) + if (unlikely(number_of_ovpages(env, dsize) > lp.page->mp_pages)) rc = bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n", - dsize, lp->mp_pages); + dsize, lp.page->mp_pages); } continue; } @@ -18455,7 +18431,7 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { return MDBX_CURSOR_FULL; pgno_t pgno = node_pgno(node); MDBX_page *np; - int rc = mdbx_page_get(mc, pgno, &np, pp_txnid4chk(mp, mc->mc_txn)); + int rc = mdbx_page_get(mc, pgno, &np, mp->mp_txnid); mdbx_cassert(mc, rc == MDBX_SUCCESS); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19473,8 +19449,7 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { const pgno_t pgno = 
node_largedata_pgno(node); poke_pgno(node_data(node), my->mc_next_pgno); - rc = mdbx_page_get(&couple.outer, pgno, &omp, - pp_txnid4chk(mp, my->mc_txn)); + rc = mdbx_page_get(&couple.outer, pgno, &omp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) goto done; unsigned toggle = my->mc_head & 1; @@ -19529,7 +19504,7 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { rc = mdbx_page_get( &couple.outer, node_pgno(page_node(mp, couple.outer.mc_ki[couple.outer.mc_top])), - &mp, pp_txnid4chk(mp, my->mc_txn)); + &mp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) goto done; couple.outer.mc_top++; @@ -21542,8 +21517,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, npages = 1; MDBX_page *op; - err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op, - pp_txnid4chk(mp, ctx->mw_txn)); + err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op, mp->mp_txnid); if (err == MDBX_SUCCESS) err = mdbx_page_check(ctx->mw_cursor, op, 0); if (err == MDBX_SUCCESS) { @@ -21648,8 +21622,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { - err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1, - pp_txnid4chk(mp, ctx->mw_txn)); + err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_RESULT_TRUE) break; @@ -21696,8 +21669,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, - pp_txnid4chk(mp, ctx->mw_txn)); + err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); MDBX_xcursor *inner_xcursor = container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); MDBX_cursor_couple *couple = diff --git a/src/internals.h b/src/internals.h 
index 21a88711..952690e2 100644 --- a/src/internals.h +++ b/src/internals.h @@ -511,7 +511,8 @@ typedef struct MDBX_page { #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; + uint64_t + mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ From d4ef9bf233f704c7e34f6cf48b56c6f4ebb69c37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 30 Jun 2022 21:38:32 +0300 Subject: [PATCH 014/364] mdbx: rework page validation/checking, add `MDBX_VALIDATION` option (squashed). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Здесь основная часть изменений преобразующих отладочную проверку страниц в регулярный и доступный пользователю осторожный/безопасный режим работы с потенциально поврежденной БД. Here the major part of the changes that transform a debugging check of pages into a regular and user-accessible careful/safe mode for working with a potentially corrupted database. 
--- CMakeLists.txt | 2 +- mdbx.h | 5 +- src/bits.md | 2 +- src/config.h.in | 2 +- src/core.c | 878 ++++++++++++++++++++++++++---------------------- src/internals.h | 36 +- src/mdbx_chk.c | 4 +- src/mdbx_dump.c | 12 +- src/options.h | 10 +- 9 files changed, 516 insertions(+), 435 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 387d60d7..784da6e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -497,7 +497,7 @@ mark_as_advanced(MDBX_LOCKING) add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Clock or just a fake" AUTO) mark_as_advanced(MDBX_TRUST_RTC) option(MDBX_FORCE_ASSERTIONS "Force enable assertion checking" OFF) -option(MDBX_DISABLE_PAGECHECKS "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF) +option(MDBX_DISABLE_VALIDATION "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF) if(NOT MDBX_AMALGAMATED_SOURCE) if(CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") diff --git a/mdbx.h b/mdbx.h index 80eb7325..e37ab981 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1023,6 +1023,9 @@ LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, enum MDBX_env_flags_t { MDBX_ENV_DEFAULTS = 0, + /** Extra validation of DB structure and pages content. */ + MDBX_VALIDATION = UINT32_C(0x00002000), + /** No environment directory. * * By default, MDBX creates its environment in a directory whose pathname is @@ -5091,7 +5094,7 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env); * \retval 1 Transaction aborted asynchronous and reader slot * should be cleared immediately, i.e. read transaction * will not continue but \ref mdbx_txn_abort() - * or \ref mdbx_txn_reset() will be called later. + * nor \ref mdbx_txn_reset() will be called later. 
* * \retval 2 or great The reader process was terminated or killed, * and libmdbx should entirely reset reader registration. diff --git a/src/bits.md b/src/bits.md index 99cef8e8..82c9eed4 100644 --- a/src/bits.md +++ b/src/bits.md @@ -13,7 +13,7 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD 10|0000 0400| | | | | | | | | 11|0000 0800| | | | | | | | | 12|0000 1000| | | | | | | | | -13|0000 2000| | | | | | |P_SPILLED | | +13|0000 2000|VALIDATION | | | | | |P_SPILLED | | 14|0000 4000|NOSUBDIR | | | | | |P_LOOSE | | 15|0000 8000| | |DB_VALID |NOSPILL | | |P_FROZEN | | 16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE | | | diff --git a/src/config.h.in b/src/config.h.in index 7959699a..58119c33 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -26,7 +26,7 @@ #ifndef MDBX_TRUST_RTC_AUTO #cmakedefine01 MDBX_TRUST_RTC #endif -#cmakedefine01 MDBX_DISABLE_PAGECHECKS +#cmakedefine01 MDBX_DISABLE_VALIDATION /* Windows */ #cmakedefine01 MDBX_WITHOUT_MSVC_CRT diff --git a/src/core.c b/src/core.c index 557f266c..713d4bab 100644 --- a/src/core.c +++ b/src/core.c @@ -683,38 +683,37 @@ number_of_ovpages(const MDBX_env *env, size_t bytes) { return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; } +__cold static const char *pagetype_caption(const uint8_t type, + char buf4unknown[16]) { + switch (type) { + case P_BRANCH: + return "branch"; + case P_LEAF: + return "leaf"; + case P_LEAF | P_SUBP: + return "subleaf"; + case P_LEAF | P_LEAF2: + return "dupfixed-leaf"; + case P_LEAF | P_LEAF2 | P_SUBP: + return "dupfixed-subleaf"; + case P_OVERFLOW: + return "large"; + default: + snprintf(buf4unknown, 16, "unknown_0x%x", type); + return buf4unknown; + } +} + __cold static int MDBX_PRINTF_ARGS(2, 3) bad_page(const MDBX_page *mp, const char *fmt, ...) 
{ if (mdbx_log_enabled(MDBX_LOG_ERROR)) { static const MDBX_page *prev; if (prev != mp) { + char buf4unknown[16]; prev = mp; - const char *type; - switch (mp->mp_flags & (P_BRANCH | P_LEAF | P_OVERFLOW | P_META | - P_LEAF2 | P_BAD | P_SUBP)) { - case P_BRANCH: - type = "branch"; - break; - case P_LEAF: - type = "leaf"; - break; - case P_LEAF | P_SUBP: - type = "subleaf"; - break; - case P_LEAF | P_LEAF2: - type = "dupfixed-leaf"; - break; - case P_LEAF | P_LEAF2 | P_SUBP: - type = "dupfixed-subleaf"; - break; - case P_OVERFLOW: - type = "large"; - break; - default: - type = "broken"; - } mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0, - "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", type, + "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_EXTRA(mp), buf4unknown), mp->mp_pgno, mp->mp_txnid); } @@ -729,7 +728,7 @@ __cold static int MDBX_PRINTF_ARGS(2, 3) /* Address of node i in page p */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node * page_node(const MDBX_page *mp, unsigned i) { - assert((mp->mp_flags & (P_LEAF2 | P_OVERFLOW | P_META)) == 0); + assert(PAGETYPE(mp) == P_LEAF || PAGETYPE(mp) == P_BRANCH); assert(page_numkeys(mp) > (unsigned)(i)); assert(mp->mp_ptrs[i] % 2 == 0); return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); @@ -740,8 +739,7 @@ page_node(const MDBX_page *mp, unsigned i) { * There are no node headers, keys are stored contiguously. 
*/ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { - assert((mp->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_META)) == - (P_LEAF | P_LEAF2)); + assert(PAGETYPE(mp) == (P_LEAF | P_LEAF2)); assert(mp->mp_leaf2_ksize == keysize); (void)keysize; return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); @@ -3178,7 +3176,7 @@ static bool mdbx_pnl_check(const MDBX_PNL pl, const size_t limit) { return false; if (unlikely(MDBX_PNL_MOST(pl) >= limit)) return false; - if (mdbx_audit_enabled()) { + if (!MDBX_DISABLE_VALIDATION || mdbx_audit_enabled()) { for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) { assert(MDBX_PNL_ORDERED(scan[0], scan[1])); if (unlikely(!MDBX_PNL_ORDERED(scan[0], scan[1]))) @@ -3941,10 +3939,9 @@ static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, bool dont_filter_gc); static int __must_check_result mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp, - unsigned options); -static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc, - unsigned options); + const MDBX_page *const mp); +static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc); +static int __must_check_result mdbx_cursor_check_updating(MDBX_cursor *mc); static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, @@ -4257,8 +4254,7 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { MDBX_val key; DKBUF; - switch (mp->mp_flags & - (P_BRANCH | P_LEAF | P_LEAF2 | P_META | P_OVERFLOW | P_SUBP)) { + switch (PAGETYPE_EXTRA(mp)) { case P_BRANCH: type = "Branch page"; break; @@ -4785,7 +4781,7 @@ static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); mp->mp_txnid = INVALID_TXNID; - mp->mp_flags = 0xFFFF; + mp->mp_flags = P_BAD; VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); if (txn->mt_flags & MDBX_WRITEMAP) { VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), @@ -6668,17 +6664,11 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { goto fail; } - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(key.iov_len != sizeof(txnid_t))) { + if (unlikely(key.iov_len != sizeof(txnid_t))) { ret.err = MDBX_CORRUPTED; goto fail; } last = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(last < MIN_TXNID || last > MAX_TXNID)) { - ret.err = MDBX_CORRUPTED; - goto fail; - } if (oldest <= last) { oldest = mdbx_find_oldest(txn); if (oldest <= last) { @@ -6720,7 +6710,8 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); pgno_t *gc_pnl = (pgno_t *)data.iov_base; mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); - if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; @@ -6771,8 +6762,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Merge in descending sorted order */ const unsigned prev_re_len = MDBX_PNL_SIZE(re_list); mdbx_pnl_xmerge(re_list, gc_pnl); - /* re-check to avoid duplicates */ - if (!MDBX_DISABLE_PAGECHECKS && + if (mdbx_audit_enabled() && unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; @@ -8177,6 +8167,10 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); if (rc != MDBX_SUCCESS) goto bailout; + } else { + env->me_dxb_mmap.current = size; + env->me_dxb_mmap.filesize = + (env->me_dxb_mmap.filesize < size) ? 
size : env->me_dxb_mmap.filesize; } if (txn->mt_flags & MDBX_TXN_RDONLY) { #if defined(_WIN32) || defined(_WIN64) @@ -8191,10 +8185,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { mdbx_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ - } else { - env->me_dxb_mmap.current = size; - env->me_dxb_mmap.filesize = - (env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) mdbx_txn_valgrind(env, txn); @@ -9376,12 +9366,11 @@ retry: * now delete them and any we reserved for tw.reclaimed_pglist. */ while (ctx->cleaned_id <= txn->tw.last_reclaimed) { rc = mdbx_cursor_first(&ctx->cursor.outer, &key, NULL); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND) - break; + if (rc == MDBX_NOTFOUND) + break; + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; @@ -9390,11 +9379,6 @@ retry: ctx->settled = 0; ctx->reused_slot = 0; ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && unlikely(ctx->cleaned_id < MIN_TXNID || - ctx->cleaned_id > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } if (ctx->cleaned_id > txn->tw.last_reclaimed) break; if (likely(!ctx->dense)) { @@ -9701,11 +9685,6 @@ retry: goto bailout; } txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } if (gc_first <= MIN_TXNID) { mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN " (going dense-mode)", @@ -9756,17 +9735,11 @@ retry: ctx->rid = mdbx_find_oldest(txn) - 1; rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); if (rc == MDBX_SUCCESS) { - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(key.iov_len != sizeof(txnid_t))) { + if 
(unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } if (ctx->rid >= gc_first) ctx->rid = gc_first - 1; if (unlikely(ctx->rid == 0)) { @@ -12189,8 +12162,9 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_ftruncate(env->me_lazy_fd, - env->me_dxb_mmap.filesize = env->me_dbgeo.now); + err = mdbx_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize = + env->me_dxb_mmap.current = + env->me_dbgeo.now); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -13881,10 +13855,10 @@ static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, /* Pop a page off the top of the cursor's stack. */ static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { - if (mc->mc_snum) { + if (likely(mc->mc_snum)) { mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - if (--mc->mc_snum) { + if (likely(--mc->mc_snum)) { mc->mc_top--; } else { mc->mc_flags &= ~C_INITIALIZED; @@ -13903,11 +13877,9 @@ static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { return MDBX_CURSOR_FULL; } - mdbx_cassert(mc, mc->mc_snum < UINT16_MAX); mc->mc_top = mc->mc_snum++; mc->mc_pg[mc->mc_top] = mp; mc->mc_ki[mc->mc_top] = 0; - return MDBX_SUCCESS; } @@ -13959,13 +13931,12 @@ spilled: dirty: if (unlikely(ret.page->mp_pgno != pgno)) { bad_page(ret.page, - "mismatch actual pgno (%" PRIaPGNO ") != expected (%" PRIaPGNO - ")\n", + "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", ret.page->mp_pgno, pgno); goto notfound; } -#if !MDBX_DISABLE_PAGECHECKS +#if !MDBX_DISABLE_VALIDATION if (unlikely(ret.page->mp_flags & P_ILL_BITS)) { ret.err = bad_page(ret.page, "invalid page's flags (%u)\n", ret.page->mp_flags); @@ -13976,7 +13947,7 
@@ dirty: unlikely(ret.page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) { ret.err = bad_page( ret.page, - "invalid page txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", + "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", ret.page->mp_txnid, (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn" : "parent-page", @@ -13993,11 +13964,13 @@ dirty: ret.page->mp_lower, ret.page->mp_upper, page_space(env)); goto bailout; } -#endif /* !MDBX_DISABLE_PAGECHECKS */ +#endif /* !MDBX_DISABLE_VALIDATION */ + + if (unlikely(mc->mc_checking & CC_PAGECHECK) && + unlikely(MDBX_SUCCESS != (ret.err = mdbx_page_check(mc, ret.page)))) + goto bailout; ret.err = MDBX_SUCCESS; - if (mdbx_audit_enabled()) - ret.err = mdbx_page_check(mc, ret.page, C_UPDATING); return ret; } @@ -14062,13 +14035,11 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, } } -#if !MDBX_DISABLE_PAGECHECKS - if (unlikely(!IS_LEAF(mp))) { - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return bad_page(mp, "index points to a page with 0x%02x flags\n", - mp->mp_flags); + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; } -#endif /* !MDBX_DISABLE_PAGECHECKS */ mdbx_debug("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, DKEY_DEBUG(key)); @@ -14097,7 +14068,7 @@ static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, assert(dbx->md_vlen_max != (unsigned)-1); if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely(db->md_xsize < dbx->md_vlen_min || + if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min || db->md_xsize > dbx->md_vlen_max)) { mdbx_error("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", db->md_xsize, dbx->md_vlen_min, dbx->md_vlen_max); @@ -14170,14 +14141,14 @@ static int 
mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { } memcpy(db, data.iov_base, sizeof(MDBX_db)); -#if !MDBX_DISABLE_PAGECHECKS +#if !MDBX_DISABLE_VALIDATION mdbx_tassert(txn, txn->mt_front >= pp_txnid); if (unlikely(db->md_mod_txnid > pp_txnid)) { mdbx_error("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", db->md_mod_txnid, pp_txnid); return MDBX_CORRUPTED; } -#endif /* !MDBX_DISABLE_PAGECHECKS */ +#endif /* !MDBX_DISABLE_VALIDATION */ rc = mdbx_setup_dbx(dbx, db, txn->mt_env->me_psize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14529,15 +14500,14 @@ skip: " with %u keys, key index %u", mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -14622,15 +14592,14 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, " with %u keys, key index %u", mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if 
(likely(key)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -14846,18 +14815,19 @@ got_node: mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + ret.err = MDBX_CORRUPTED; + return ret; + } + if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - ret.err = MDBX_CORRUPTED; - } else { - if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); - } - ret.err = MDBX_SUCCESS; + if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } + ret.err = MDBX_SUCCESS; return ret; } @@ -14955,35 +14925,35 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { return rc; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; mc->mc_ki[mc->mc_top] = 0; - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mc->mc_pg[mc->mc_top]->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (IS_LEAF2(mp)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = 
page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + key->iov_base = page_leaf2key(mp, 0, key->iov_len); } return MDBX_SUCCESS; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); + MDBX_node *node = page_node(mp, 0); if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); + rc = mdbx_node_read(mc, node, data, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -15005,35 +14975,34 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { return rc; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; mc->mc_flags |= C_INITIALIZED | C_EOF; - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mc->mc_pg[mc->mc_top]->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (IS_LEAF2(mp)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], - mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } return MDBX_SUCCESS; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *node = page_node(mp, 
mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); + rc = mdbx_node_read(mc, node, data, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -15060,7 +15029,12 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_ENODATA; - MDBX_page *mp = mc->mc_pg[mc->mc_top]; + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } const unsigned nkeys = page_numkeys(mp); if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { mdbx_cassert(mc, nkeys <= UINT16_MAX); @@ -15074,11 +15048,6 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = MDBX_SUCCESS; if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } else { @@ -15313,6 +15282,11 @@ static int cursor_next_batch(MDBX_cursor *mc) { mp = mc->mc_pg[mc->mc_top]; mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } } return MDBX_SUCCESS; } @@ -15354,8 
+15328,13 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return rc; } - const MDBX_page *const page = mc->mc_pg[mc->mc_top]; - const unsigned nkeys = page_numkeys(page); + const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } + const unsigned nkeys = page_numkeys(mp); unsigned i = mc->mc_ki[mc->mc_top], n = 0; if (unlikely(i >= nkeys)) { mdbx_cassert(mc, op == MDBX_GET_CURRENT); @@ -15371,13 +15350,13 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return MDBX_NOTFOUND; } - const txnid_t pp_txnid = page->mp_txnid; + const txnid_t pp_txnid = mp->mp_txnid; do { if (unlikely(n + 2 > limit)) { rc = MDBX_RESULT_TRUE; break; } - const MDBX_node *leaf = page_node(page, i); + const MDBX_node *leaf = page_node(mp, i); get_key(leaf, &pairs[n]); rc = mdbx_node_read(mc, leaf, &pairs[n + 1], pp_txnid); if (unlikely(rc != MDBX_SUCCESS)) @@ -15793,7 +15772,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + err = mdbx_cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -15802,7 +15781,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, more:; if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + err = mdbx_cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -15873,7 +15852,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, memcpy(page_data(pgr.page), data->iov_base, data->iov_len); if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + err = mdbx_cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -16074,7 +16053,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } if 
(mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + err = mdbx_cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -16095,7 +16074,7 @@ new_sub:; nflags |= MDBX_SPLIT_REPLACE; rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + rc = mdbx_cursor_check(mc); } else { /* There is room already in this leaf page. */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { @@ -16221,7 +16200,7 @@ new_sub:; } } if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + rc = mdbx_cursor_check(mc); return rc; bad_sub: if (unlikely(rc == MDBX_KEYEXIST)) { @@ -16264,16 +16243,13 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; - if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } - goto del_key; } + if (IS_LEAF2(mp)) + goto del_key; MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { @@ -16654,10 +16630,13 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { #if MDBX_DEBUG > 0 if (mdbx_audit_enabled()) { - int page_check_err = mdbx_page_check(mc, mp, C_UPDATING); + const unsigned checking = mc->mc_checking; + mc->mc_checking |= CC_UPDATING; + const int page_check_err = mdbx_page_check(mc, mp); + mc->mc_checking = checking; mdbx_cassert(mc, page_check_err == MDBX_SUCCESS); } -#endif +#endif /* MDBX_DEBUG > 0 */ } /* Compact the main page after deleting a node on a subpage. 
@@ -16723,7 +16702,7 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */ static int mdbx_xcursor_init0(MDBX_cursor *mc) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", mc->mc_dbi); return MDBX_CORRUPTED; @@ -16738,7 +16717,11 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { mx->mx_cursor.mc_dbistate = mc->mc_dbistate; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB; + STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_LEAF2); + mdbx_cassert(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF); + mx->mx_cursor.mc_checking = + mc->mc_checking + ((mc->mc_db->md_flags & MDBX_DUPFIXED) << 1); mx->mx_dbx.md_name.iov_len = 0; mx->mx_dbx.md_name.iov_base = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; @@ -16756,7 +16739,7 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, const MDBX_page *mp) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", mc->mc_dbi); return MDBX_CORRUPTED; @@ -16768,14 +16751,14 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mdbx_error("invalid node flags %u", flags); return MDBX_CORRUPTED; case F_DUPDATA | F_SUBDATA: - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { mdbx_error("invalid nested-db record size %zu", node_ds(node)); return MDBX_CORRUPTED; } memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); const txnid_t pp_txnid = mp->mp_txnid; 
- if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) { mdbx_error("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", @@ -16785,10 +16768,10 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB; break; case F_DUPDATA: - if (!MDBX_DISABLE_PAGECHECKS && unlikely(node_ds(node) <= PAGEHDRSZ)) { + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) { mdbx_error("invalid nested-page size %zu", node_ds(node)); return MDBX_CORRUPTED; } @@ -16802,8 +16785,7 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mx->mx_db.md_mod_txnid = mp->mp_txnid; mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = - C_INITIALIZED | C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags); @@ -16813,17 +16795,17 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, } if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mc->mc_db->md_xsize != 0)) { + if (!MDBX_DISABLE_VALIDATION && unlikely(mc->mc_db->md_xsize != 0)) { mdbx_error("cursor mismatched nested-db md_xsize %u", mc->mc_db->md_xsize); return MDBX_CORRUPTED; } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) { mdbx_error("mismatched nested-db md_flags %u", mc->mc_db->md_flags); return MDBX_CORRUPTED; } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min || mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) { mdbx_error("mismatched nested-db.md_xsize 
(%u) <> min/max value-length " @@ -16853,7 +16835,7 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, bool new_dupdata) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", mc->mc_dbi); return MDBX_CORRUPTED; @@ -16862,7 +16844,7 @@ static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, if (new_dupdata) { mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; } @@ -16894,6 +16876,12 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, couple->outer.mc_top = 0; couple->outer.mc_pg[0] = 0; couple->outer.mc_flags = 0; + STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF && + CC_LEAF2 == P_LEAF2); + couple->outer.mc_checking = + (mdbx_audit_enabled() || (txn->mt_env->me_flags & MDBX_VALIDATION)) + ? 
CC_PAGECHECK | CC_LEAF + : CC_LEAF; couple->outer.mc_ki[0] = 0; couple->outer.mc_xcursor = NULL; @@ -17220,7 +17208,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { mdbx_node_del(mc, 0); int rc = mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, C_UPDATING); + rc = mdbx_cursor_check_updating(mc); return rc; } @@ -17813,6 +17801,7 @@ static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; + cdst->mc_checking = csrc->mc_checking; for (unsigned i = 0; i < csrc->mc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; @@ -18068,9 +18057,9 @@ retry: if (nkeys >= minkeys) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - if (!mdbx_audit_enabled()) - return MDBX_SUCCESS; - return mdbx_cursor_check(mc, C_UPDATING); + if (mdbx_audit_enabled()) + return mdbx_cursor_check_updating(mc); + return MDBX_SUCCESS; } if (likely(room_threshold > 0)) { @@ -18086,65 +18075,114 @@ retry: } __cold static int mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp, unsigned options) { + const MDBX_page *const mp) { DKBUF; - options |= mc->mc_flags; - MDBX_env *const env = mc->mc_txn->mt_env; - const unsigned nkeys = page_numkeys(mp); - char *const end_of_page = (char *)mp + env->me_psize; + int rc = MDBX_SUCCESS; if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) - return bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); - if (IS_OVERFLOW(mp)) { + rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); + + MDBX_env *const env = mc->mc_txn->mt_env; + const ptrdiff_t offset = (uint8_t *)mp - env->me_dxb_mmap.dxb; + unsigned flags_mask = P_ILL_BITS; + unsigned flags_expected = 0; + if (offset < 0 || + offset > (ptrdiff_t)(env->me_dxb_mmap.current - ((mp->mp_flags & P_SUBP) + ? 
PAGEHDRSZ + 1 + : env->me_psize))) { + /* should be dirty page without MDBX_WRITEMAP, or a subpage of. */ + flags_mask -= P_SUBP; + if ((env->me_flags & MDBX_WRITEMAP) != 0 || + (!IS_SHADOWED(mc->mc_txn, mp) && !(mp->mp_flags & P_SUBP))) + rc = bad_page(mp, "invalid page-address %p, offset %zi\n", + __Wpedantic_format_voidptr(mp), offset); + } else if (offset & (env->me_psize - 1)) + flags_expected = P_SUBP; + + if (unlikely((mp->mp_flags & flags_mask) != flags_expected)) + rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", + mp->mp_flags & flags_mask, flags_expected); + + const uint8_t type = PAGETYPE_EXTRA(mp); + switch (type) { + default: + return bad_page(mp, "invalid type (%u)\n", type); + case P_OVERFLOW: + if (unlikely((mc->mc_flags & C_SUB) || (mc->mc_checking & CC_LEAF2))) + rc = + bad_page(mp, "unexpected overflow-page for dupsort db (flags 0x%x)\n", + mc->mc_db->md_flags); if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2)) - return bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages); + rc = bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages); if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno)) - return bad_page(mp, "overflow page beyond (%u) next-pgno\n", - mp->mp_pgno + mp->mp_pages); - if (unlikely((options & (C_SUB | C_COPYING)) == C_SUB)) - return bad_page(mp, - "unexpected overflow-page for dupsort db (flags 0x%x)\n", - mc->mc_db->md_flags); - return MDBX_SUCCESS; + rc = bad_page(mp, "overflow page beyond (%u) next-pgno\n", + mp->mp_pgno + mp->mp_pages); + return rc; + case P_LEAF: + case P_LEAF | P_SUBP: + if (unlikely((mc->mc_checking & CC_LEAF2) != 0)) + rc = bad_page( + mp, "unexpected leaf-page for dupfixed subtree (db-lags 0x%x)\n", + mc->mc_db->md_flags); + break; + case P_LEAF | P_LEAF2: + case P_LEAF | P_LEAF2 | P_SUBP: + if (unlikely((mc->mc_checking & CC_LEAF2) == 0)) + rc = bad_page( + mp, + "unexpected leaf2-page for non-dupfixed (sub)tree (db-flags 0x%x)\n", + 
mc->mc_db->md_flags); + break; + case P_BRANCH: + break; } - int rc = MDBX_SUCCESS; - if ((options & C_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp)) { + char *const end_of_page = (char *)mp + env->me_psize; + const unsigned nkeys = page_numkeys(mp); + if ((mc->mc_checking & CC_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp)) { if (unlikely(nkeys < 2 && IS_BRANCH(mp))) - rc = bad_page(mp, "branch-page nkey (%u) < 2\n", nkeys); + rc = bad_page(mp, "branch-page nkeys (%u) < 2\n", nkeys); + } + + const size_t ksize_max = keysize_max(env->me_psize, 0); + const size_t leaf2_ksize = mp->mp_leaf2_ksize; + if (IS_LEAF2(mp)) { + if ((mc->mc_checking & CC_COPYING) == 0) { + if (unlikely((mc->mc_flags & C_SUB) == 0 || + (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) + rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", + mc->mc_db->md_flags); + } + if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max)) + rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize); } - if (IS_LEAF2(mp) && unlikely((options & (C_SUB | C_COPYING)) == 0)) - rc = bad_page(mp, "unexpected leaf2-page (db flags 0x%x)\n", - mc->mc_db->md_flags); MDBX_val here, prev = {0, 0}; for (unsigned i = 0; i < nkeys; ++i) { if (IS_LEAF2(mp)) { - const size_t ksize = mp->mp_leaf2_ksize; - char *const key = page_leaf2key(mp, i, ksize); - if (unlikely(end_of_page < key + ksize)) { + char *const key = page_leaf2key(mp, i, leaf2_ksize); + if (unlikely(end_of_page < key + leaf2_ksize)) { rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", - key + ksize - end_of_page); + key + leaf2_ksize - end_of_page); continue; } - if ((options & C_COPYING) == 0) { - if (unlikely(ksize != mc->mc_dbx->md_klen_min)) { - if (unlikely(ksize < mc->mc_dbx->md_klen_min || - ksize > mc->mc_dbx->md_klen_max)) - rc = bad_page( - mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", - ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - else - mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = ksize; - } 
- if ((options & C_SKIPORD) == 0) { - here.iov_len = ksize; - here.iov_base = key; - if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i, - DKEY(&prev), DVAL(&here)); - prev = here; - } + if ((mc->mc_checking & CC_COPYING) == 0 && + unlikely(leaf2_ksize != mc->mc_dbx->md_klen_min)) { + if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || + leaf2_ksize > mc->mc_dbx->md_klen_max)) + rc = bad_page( + mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", + leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); + else + mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; + } + if ((mc->mc_checking & CC_SKIPORD) == 0) { + here.iov_len = leaf2_ksize; + here.iov_base = key; + if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) + rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i, + DKEY(&prev), DVAL(&here)); + prev = here; } } else { const MDBX_node *const node = page_node(mp, i); @@ -18154,20 +18192,23 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, node_end - end_of_page); continue; } - size_t ksize = node_ks(node); + const size_t ksize = node_ks(node); + if (unlikely(ksize > ksize_max)) + rc = bad_page(mp, "node[%u] too long key (%zu)\n", i, ksize); char *key = node_key(node); if (unlikely(end_of_page < key + ksize)) { rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i, key + ksize - end_of_page); continue; } - if ((IS_LEAF(mp) || i > 0) && (options & C_COPYING) == 0) { - if (unlikely(ksize < mc->mc_dbx->md_klen_min || + if ((IS_LEAF(mp) || i > 0)) { + if ((mc->mc_checking & CC_COPYING) == 0 && + unlikely(ksize < mc->mc_dbx->md_klen_min || ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n", i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - if ((options & C_SKIPORD) == 0) { + if ((mc->mc_checking & CC_SKIPORD) == 0) { here.iov_base = 
key; here.iov_len = ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) @@ -18177,10 +18218,11 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, } } if (IS_BRANCH(mp)) { - if ((options & C_UPDATING) == 0 && i == 0 && unlikely(ksize != 0)) + if ((mc->mc_checking & CC_UPDATING) == 0 && i == 0 && + unlikely(ksize != 0)) rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n", i, ksize); - if ((options & C_RETIRING) == 0) { + if ((mc->mc_checking & CC_RETIRING) == 0) { const pgno_t ref = node_pgno(node); if (unlikely(ref < MIN_PAGENO || ref >= mc->mc_txn->mt_next_pgno)) rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); @@ -18212,7 +18254,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); continue; } - if ((options & C_COPYING) == 0) { + if ((mc->mc_checking & CC_COPYING) == 0) { if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || dsize > mc->mc_dbx->md_vlen_max)) rc = bad_page( @@ -18220,9 +18262,18 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); } - if ((options & C_RETIRING) == 0) { + if ((mc->mc_checking & CC_RETIRING) == 0) { + /* Disable full checking to avoid infinite recursion + * with a corrupted DB */ +#if !MDBX_DISABLE_VALIDATION + const uint8_t save_checking_level = mc->mc_checking; + mc->mc_checking &= ~CC_PAGECHECK; +#endif /* MDBX_DISABLE_VALIDATION */ const struct page_result lp = mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); +#if !MDBX_DISABLE_VALIDATION + mc->mc_checking = save_checking_level; +#endif /* MDBX_DISABLE_VALIDATION */ if (unlikely(lp.err != MDBX_SUCCESS)) return lp.err; if (unlikely(!IS_OVERFLOW(lp.page))) { @@ -18250,7 +18301,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, /* wrong, but already handled */ continue; case 0 /* usual */: - if 
((options & C_COPYING) == 0) { + if ((mc->mc_checking & CC_COPYING) == 0) { if (unlikely(dsize < mc->mc_dbx->md_vlen_min || dsize > mc->mc_dbx->md_vlen_max)) { rc = bad_page( @@ -18303,29 +18354,27 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } - if ((options & C_COPYING) == 0) { - if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) { - rc = bad_page(mp, - "nested-leaf2-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - continue; - } + if ((mc->mc_checking & CC_COPYING) == 0 && + unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) + rc = bad_page(mp, + "nested-leaf2-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + else mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; - } - if ((options & C_SKIPORD) == 0) { - sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page( - mp, "nested-leaf2-key #%u wrong order (%s >= %s)\n", j, - DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } + } + if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_len = sub_ksize; + sub_here.iov_base = sub_key; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page(mp, + "nested-leaf2-key #%u wrong order (%s >= %s)\n", + j, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; } } else { const MDBX_node *const sub_node = page_node(sp, j); @@ -18344,7 +18393,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, size_t sub_dsize = node_ds(sub_node); /* char *sub_data = node_data(sub_node); */ - if ((options & C_COPYING) == 0) { + if 
((mc->mc_checking & CC_COPYING) == 0) { if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || sub_ksize > mc->mc_dbx->md_vlen_max)) rc = bad_page(mp, @@ -18352,17 +18401,16 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, "value-length (%zu/%zu)\n", sub_ksize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - - if ((options & C_SKIPORD) == 0) { - sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page( - mp, "nested-node-key #%u wrong order (%s >= %s)\n", j, - DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } + } + if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_len = sub_ksize; + sub_here.iov_base = sub_key; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page(mp, + "nested-node-key #%u wrong order (%s >= %s)\n", + j, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; } if (unlikely(sub_dsize != 0)) rc = bad_page(mp, "nested-node non-empty data size (%zu)\n", @@ -18380,19 +18428,23 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, return rc; } -__cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { +__cold static int mdbx_cursor_check(MDBX_cursor *mc) { mdbx_cassert(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == (mc->mc_txn->mt_parent ? mc->mc_txn->mt_parent->tw.dirtyroom : mc->mc_txn->mt_env->me_options.dp_limit)); - mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1 || (options & C_UPDATING)); - if (unlikely(mc->mc_top != mc->mc_snum - 1) && (options & C_UPDATING) == 0) + mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1 || + (mc->mc_checking & CC_UPDATING)); + if (unlikely(mc->mc_top != mc->mc_snum - 1) && + (mc->mc_checking & CC_UPDATING) == 0) return MDBX_CURSOR_FULL; - mdbx_cassert(mc, (options & C_UPDATING) ? 
mc->mc_snum <= mc->mc_db->md_depth - : mc->mc_snum == mc->mc_db->md_depth); - if (unlikely((options & C_UPDATING) ? mc->mc_snum > mc->mc_db->md_depth - : mc->mc_snum != mc->mc_db->md_depth)) + mdbx_cassert(mc, (mc->mc_checking & CC_UPDATING) + ? mc->mc_snum <= mc->mc_db->md_depth + : mc->mc_snum == mc->mc_db->md_depth); + if (unlikely((mc->mc_checking & CC_UPDATING) + ? mc->mc_snum > mc->mc_db->md_depth + : mc->mc_snum != mc->mc_db->md_depth)) return MDBX_CURSOR_FULL; for (int n = 0; n < (int)mc->mc_snum; ++n) { @@ -18405,7 +18457,7 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { mdbx_cassert(mc, branch == expect_branch); if (unlikely(branch != expect_branch)) return MDBX_CURSOR_FULL; - if ((options & C_UPDATING) == 0) { + if ((mc->mc_checking & CC_UPDATING) == 0) { mdbx_cassert(mc, nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && (mc->mc_flags & C_EOF) != 0)); @@ -18419,7 +18471,7 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { return MDBX_CURSOR_FULL; } - int err = mdbx_page_check(mc, mp, options); + int err = mdbx_page_check(mc, mp); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -18431,15 +18483,15 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { return MDBX_CURSOR_FULL; pgno_t pgno = node_pgno(node); MDBX_page *np; - int rc = mdbx_page_get(mc, pgno, &np, mp->mp_txnid); - mdbx_cassert(mc, rc == MDBX_SUCCESS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = mdbx_page_get(mc, pgno, &np, mp->mp_txnid); + mdbx_cassert(mc, err == MDBX_SUCCESS); + if (unlikely(err != MDBX_SUCCESS)) + return err; const bool nested_leaf = IS_LEAF(np) ? 
true : false; mdbx_cassert(mc, nested_leaf == expect_nested_leaf); if (unlikely(nested_leaf != expect_nested_leaf)) return MDBX_CURSOR_FULL; - err = mdbx_page_check(mc, np, options); + err = mdbx_page_check(mc, np); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -18448,6 +18500,14 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { return MDBX_SUCCESS; } +__cold static int mdbx_cursor_check_updating(MDBX_cursor *mc) { + const unsigned checking = mc->mc_checking; + mc->mc_checking |= CC_UPDATING; + const int rc = mdbx_cursor_check(mc); + mc->mc_checking = checking; + return rc; +} + /* Complete a delete operation started by mdbx_cursor_del(). */ static int mdbx_cursor_del0(MDBX_cursor *mc) { int rc; @@ -18558,7 +18618,7 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { mdbx_cassert(mc, rc == MDBX_SUCCESS); if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + rc = mdbx_cursor_check(mc); return rc; bailout: @@ -18649,7 +18709,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, const unsigned newindx = mc->mc_ki[mc->mc_top]; unsigned nkeys = page_numkeys(mp); if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + rc = mdbx_cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -18711,7 +18771,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_top++; ptop = 0; if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + rc = mdbx_cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } @@ -18819,10 +18879,10 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + rc = mdbx_cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; - rc = mdbx_cursor_check(&mn, C_UPDATING); + rc = mdbx_cursor_check_updating(&mn); if (unlikely(rc != MDBX_SUCCESS)) goto done; } @@ -18949,7 +19009,7 @@ static int 
mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, goto done; mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + rc = mdbx_cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } @@ -19222,7 +19282,7 @@ done: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; else { if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, C_UPDATING); + rc = mdbx_cursor_check_updating(mc); if (unlikely(nflags & MDBX_RESERVE)) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node_flags(node) & F_BIGDATA)) @@ -19384,16 +19444,11 @@ __cold static int mdbx_env_cthr_toggle(mdbx_copy *my) { return my->mc_error; } -/* Depth-first tree traversal for compacting copy. - * [in] my control structure. - * [in,out] pg database root. - * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */ -__cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { +/* Depth-first tree traversal for compacting copy. */ +__cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, + const unsigned hive_flags) { MDBX_cursor_couple couple; - MDBX_page *mo, *mp, *leaf; - char *buf, *ptr; - int rc; - unsigned i; + MDBX_page *copy; /* Empty DB, nothing to do */ if (*pg == P_INVALID) @@ -19402,11 +19457,13 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { memset(&couple, 0, sizeof(couple)); couple.outer.mc_snum = 1; couple.outer.mc_txn = my->mc_txn; - couple.outer.mc_flags = couple.inner.mx_cursor.mc_flags = - C_COPYING | C_SKIPORD; + couple.outer.mc_checking = couple.inner.mx_cursor.mc_checking = + (hive_flags & MDBX_DUPFIXED) + ? 
CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF | CC_LEAF2 + : CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF; - rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0], - my->mc_txn->mt_txnid); + int rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0], + my->mc_txn->mt_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST); @@ -19414,11 +19471,12 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { return rc; /* Make cursor pages writable */ - buf = ptr = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum)); + char *const buf = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum)); if (buf == NULL) return MDBX_ENOMEM; - for (i = 0; i < couple.outer.mc_top; i++) { + char *ptr = buf; + for (unsigned i = 0; i < couple.outer.mc_top; i++) { mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i], my->mc_env->me_psize); couple.outer.mc_pg[i] = (MDBX_page *)ptr; @@ -19426,18 +19484,18 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { } /* This is writable space for a leaf page. Usually not needed. 
*/ - leaf = (MDBX_page *)ptr; + MDBX_page *const leaf = (MDBX_page *)ptr; while (couple.outer.mc_snum > 0) { - mp = couple.outer.mc_pg[couple.outer.mc_top]; + MDBX_page *mp = couple.outer.mc_pg[couple.outer.mc_top]; unsigned n = page_numkeys(mp); if (IS_LEAF(mp)) { - if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { - for (i = 0; i < n; i++) { + if (hive_flags == 0 /* may have nested F_SUBDATA or F_BIGDATA nodes */) { + for (unsigned i = 0; i < n; i++) { MDBX_node *node = page_node(mp, i); if (node_flags(node) & F_BIGDATA) { - MDBX_page *omp; + MDBX_page *osrc; /* Need writable leaf */ if (mp != leaf) { @@ -19449,7 +19507,7 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { const pgno_t pgno = node_largedata_pgno(node); poke_pgno(node_data(node), my->mc_next_pgno); - rc = mdbx_page_get(&couple.outer, pgno, &omp, mp->mp_txnid); + rc = mdbx_page_get(&couple.outer, pgno, &osrc, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) goto done; unsigned toggle = my->mc_head & 1; @@ -19460,21 +19518,21 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { goto done; toggle = my->mc_head & 1; } - mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - memcpy(mo, omp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno; - my->mc_next_pgno += omp->mp_pages; + copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + memcpy(copy, osrc, my->mc_env->me_psize); + copy->mp_pgno = my->mc_next_pgno; + my->mc_next_pgno += osrc->mp_pages; my->mc_wlen[toggle] += my->mc_env->me_psize; - if (omp->mp_pages > 1) { - my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1); - my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize; + if (osrc->mp_pages > 1) { + my->mc_olen[toggle] = pgno2bytes(my->mc_env, osrc->mp_pages - 1); + my->mc_over[toggle] = (uint8_t *)osrc + my->mc_env->me_psize; rc = mdbx_env_cthr_toggle(my); if (unlikely(rc != MDBX_SUCCESS)) goto done; toggle = my->mc_head & 1; } } else if (node_flags(node) & 
F_SUBDATA) { - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; goto done; @@ -19490,8 +19548,13 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDBX_db db; memcpy(&db, node_data(node), sizeof(MDBX_db)); - rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA); - if (rc) + STATIC_ASSERT(F_DUPDATA == MDBX_DUPSORT); + rc = mdbx_env_cwalk(my, &db.md_root, + (node_flags(node) & F_DUPDATA) + ? MDBX_DUPSORT | + (db.md_flags & MDBX_DUPFIXED) + : 0); + if (unlikely(rc != MDBX_SUCCESS)) goto done; memcpy(node_data(node), &db, sizeof(MDBX_db)); } @@ -19500,11 +19563,17 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { } else { couple.outer.mc_ki[couple.outer.mc_top]++; if (couple.outer.mc_ki[couple.outer.mc_top] < n) { - again: - rc = mdbx_page_get( - &couple.outer, - node_pgno(page_node(mp, couple.outer.mc_ki[couple.outer.mc_top])), - &mp, mp->mp_txnid); + again:; + const MDBX_node *node = + page_node(mp, couple.outer.mc_ki[couple.outer.mc_top]); + if (unlikely(node->mn_flags)) { + mdbx_error("unexpected type 0x%x of node #%u on page #%" PRIaPGNO, + node->mn_flags, couple.outer.mc_ki[couple.outer.mc_top], + couple.outer.mc_pg[couple.outer.mc_top]->mp_pgno); + rc = MDBX_CORRUPTED; + goto done; + } + rc = mdbx_page_get(&couple.outer, node_pgno(node), &mp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) goto done; couple.outer.mc_top++; @@ -19529,19 +19598,19 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { goto done; toggle = my->mc_head & 1; } - mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - mdbx_page_copy(mo, mp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno++; + copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mdbx_page_copy(copy, mp, my->mc_env->me_psize); + copy->mp_pgno = my->mc_next_pgno++; my->mc_wlen[toggle] += my->mc_env->me_psize; if 
(couple.outer.mc_top) { /* Update parent if there is one */ node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1], couple.outer.mc_ki[couple.outer.mc_top - 1]), - mo->mp_pgno); + copy->mp_pgno); mdbx_cursor_pop(&couple.outer); } else { /* Otherwise we're done */ - *pg = mo->mp_pgno; + *pg = copy->mp_pgno; break; } } @@ -20627,7 +20696,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, rc = MDBX_INCOMPATIBLE; goto early_bailout; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(data.iov_len != sizeof(MDBX_db))) { + if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; goto early_bailout; } @@ -20891,8 +20960,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { } } else { mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth); - if (mdbx_audit_enabled()) - mc->mc_flags |= C_RETIRING; + mc->mc_checking |= CC_RETIRING; const int pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? 
P_LEAF : P_BRANCH); @@ -20905,8 +20973,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - if (mdbx_audit_enabled()) - mc->mc_flags -= C_RETIRING; + mc->mc_checking -= CC_RETIRING; } if (!mc->mc_top) break; @@ -21427,7 +21494,7 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, +__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, const char *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { @@ -21453,9 +21520,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; - int rc, err = mdbx_page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); - if (err == MDBX_SUCCESS) - err = mdbx_page_check(ctx->mw_cursor, mp, 0); + int err = mdbx_page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); MDBX_page_type_t type = walk_page_type(mp); const int nentries = (mp && !IS_OVERFLOW(mp)) ? page_numkeys(mp) : 1; @@ -21470,23 +21535,6 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, payload_size; size_t align_bytes = 0; - if (err == MDBX_SUCCESS) { - /* LY: Don't use mask here, e.g bitwise - * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. 
*/ - switch (mp->mp_flags) { - default: - err = MDBX_CORRUPTED; - break; - case P_BRANCH: - if (unlikely(nentries < 2)) - err = MDBX_CORRUPTED; - case P_LEAF: - case P_LEAF | P_LEAF2: - break; - } - } - for (int i = 0; err == MDBX_SUCCESS && i < nentries; align_bytes += ((payload_size + align_bytes) & 1), i++) { if (type == MDBX_page_dupfixed_leaf) { @@ -21517,24 +21565,24 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, npages = 1; MDBX_page *op; + assert(err == MDBX_SUCCESS); err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op, mp->mp_txnid); - if (err == MDBX_SUCCESS) - err = mdbx_page_check(ctx->mw_cursor, op, 0); if (err == MDBX_SUCCESS) { /* LY: Don't use mask here, e.g bitwise * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). * Pages should not me marked dirty/loose or otherwise. */ - if (unlikely(P_OVERFLOW != op->mp_flags)) - err = bad_page(mp, "wrong page type %d for large data", op->mp_flags); - else + if (unlikely(P_OVERFLOW != op->mp_flags)) { + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } else npages = op->mp_pages; } pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); const size_t over_unused = pagesize - over_payload - over_header; - rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, name, - pagesize, MDBX_page_large, err, 1, over_payload, - over_header, over_unused); + const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, + name, pagesize, MDBX_page_large, err, 1, + over_payload, over_header, over_unused); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; } break; @@ -21542,18 +21590,23 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA /* sub-db */: { const size_t namelen = node_ks(node); payload_size += node_ds(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) + if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: payload_size += sizeof(MDBX_db); - if (unlikely(node_ds(node) != sizeof(MDBX_db))) + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } break; case F_DUPDATA /* short sub-page */: { if (unlikely(node_ds(node) <= PAGEHDRSZ)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; } @@ -21575,6 +21628,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, subtype = MDBX_subpage_dupfixed_leaf; break; default: + assert(err == MDBX_CORRUPTED); subtype = MDBX_subpage_broken; err = MDBX_CORRUPTED; } @@ -21589,14 +21643,17 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, assert(subtype == MDBX_subpage_leaf); MDBX_node *subnode = page_node(sp, j); subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); - if (unlikely(node_flags(subnode) != 0)) + if (unlikely(node_flags(subnode) != 0)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } } } - rc = ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), - subtype, err, nsubkeys, subpayload_size, - subheader_size, subunused_size + subalign_bytes); + const int rc = + ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), + subtype, err, nsubkeys, subpayload_size, + subheader_size, subunused_size + subalign_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; header_size += subheader_size; @@ -21606,13 +21663,14 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } break; default: + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } } - rc = ctx->mw_visitor(pgno, 1, ctx->mw_user, deep, name, - ctx->mw_txn->mt_env->me_psize, type, err, nentries, - payload_size, header_size, unused_size + align_bytes); + const int rc = ctx->mw_visitor( + pgno, 1, ctx->mw_user, deep, name, ctx->mw_txn->mt_env->me_psize, type, + err, nentries, payload_size, header_size, unused_size + align_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; @@ -21622,6 +21680,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { + assert(err == MDBX_SUCCESS); err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_RESULT_TRUE) @@ -21640,6 +21699,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA /* sub-db */: { const size_t namelen = node_ks(node); if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; } @@ -21648,33 +21708,37 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, char *const sub_name = (namelen < sizeof(namebuf_onstask)) ? 
namebuf_onstask : mdbx_malloc(namelen + 1); - if (sub_name) { - memcpy(sub_name, node_key(node), namelen); - sub_name[namelen] = 0; - memcpy(&db, node_data(node), sizeof(db)); - err = mdbx_walk_sdb(ctx, &db, sub_name, deep + 1); - if (sub_name != namebuf_onstask) - mdbx_free(sub_name); - } else { - err = MDBX_ENOMEM; - } + if (unlikely(!sub_name)) + return MDBX_ENOMEM; + memcpy(sub_name, node_key(node), namelen); + sub_name[namelen] = 0; + memcpy(&db, node_data(node), sizeof(db)); + assert(err == MDBX_SUCCESS); + err = mdbx_walk_sdb(ctx, &db, sub_name, deep + 1); + if (sub_name != namebuf_onstask) + mdbx_free(sub_name); } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(node_ds(node) != sizeof(MDBX_db) || - ctx->mw_cursor->mc_xcursor == NULL)) + ctx->mw_cursor->mc_xcursor == NULL)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; - else { + } else { memcpy(&db, node_data(node), sizeof(db)); assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); - ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); - MDBX_xcursor *inner_xcursor = - container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); - MDBX_cursor_couple *couple = - container_of(inner_xcursor, MDBX_cursor_couple, inner); - ctx->mw_cursor = &couple->outer; + assert(err == MDBX_SUCCESS); + err = mdbx_xcursor_init1(ctx->mw_cursor, node, mp); + if (likely(err == MDBX_SUCCESS)) { + ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; + err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); + MDBX_xcursor *inner_xcursor = + container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); + MDBX_cursor_couple *couple = + container_of(inner_xcursor, MDBX_cursor_couple, inner); + ctx->mw_cursor = &couple->outer; + } } break; } @@ -21683,25 +21747,29 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -__cold static 
int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, +__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, const char *name, int deep) { - if (unlikely(db->md_root == P_INVALID)) + if (unlikely(sdb->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ MDBX_cursor_couple couple; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = mdbx_couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbistate); + int rc = mdbx_couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (ctx->mw_dont_check_keys_ordering) { - couple.outer.mc_flags |= C_SKIPORD; - couple.inner.mx_cursor.mc_flags |= C_SKIPORD; - } + couple.outer.mc_checking |= ctx->mw_dont_check_keys_ordering + ? CC_SKIPORD | CC_PAGECHECK + : CC_PAGECHECK; + couple.inner.mx_cursor.mc_checking |= ctx->mw_dont_check_keys_ordering + ? CC_SKIPORD | CC_PAGECHECK + : CC_PAGECHECK; couple.outer.mc_next = ctx->mw_cursor; ctx->mw_cursor = &couple.outer; - rc = mdbx_walk_tree(ctx, db->md_root, name, deep, ctx->mw_txn->mt_txnid); + rc = mdbx_walk_tree(ctx, sdb->md_root, name, deep, + sdb->md_mod_txnid ? 
sdb->md_mod_txnid + : ctx->mw_txn->mt_txnid); ctx->mw_cursor = couple.outer.mc_next; return rc; } @@ -23275,9 +23343,9 @@ __dll_export " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) -#if MDBX_DISABLE_PAGECHECKS - " MDBX_DISABLE_PAGECHECKS=YES" -#endif /* MDBX_DISABLE_PAGECHECKS */ +#if MDBX_DISABLE_VALIDATION + " MDBX_DISABLE_VALIDATION=YES" +#endif /* MDBX_DISABLE_VALIDATION */ #ifdef __SANITIZE_ADDRESS__ " SANITIZE_ADDRESS=YES" #endif /* __SANITIZE_ADDRESS__ */ diff --git a/src/internals.h b/src/internals.h index 952690e2..5001d30c 100644 --- a/src/internals.h +++ b/src/internals.h @@ -523,9 +523,11 @@ typedef struct MDBX_page { #define P_BAD 0x10 /* explicit flag for invalid/bad page */ #define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ #define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ +#define PAGETYPE_EXTRA(p) ((char)(p)->mp_flags) +#define PAGETYPE(p) (PAGETYPE_EXTRA(p) & ~P_SUBP) +#define P_SPILLED 0x2000 /* spilled in parent txn */ +#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000 /* used for retire page with known status */ #define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { @@ -1032,8 +1034,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. 
*/ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -1043,18 +1045,27 @@ struct MDBX_cursor { #define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ + uint8_t mc_flags; /* see mdbx_cursor */ /* Cursor checking flags. */ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x04 /* update/rebalance pending */ +#define CC_COPYING 0x08 /* skip key-value length check (copying simplify) */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; /* page checking level */ - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_EXTRA(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. But for now we only handle these @@ -1444,8 +1455,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. 
* Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -1588,7 +1597,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 576b275c..1f91b03f 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -805,9 +805,9 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, } if (ignore_wrong_order) { /* for debugging with enabled assertions */ - mc->mc_flags |= C_SKIPORD; + mc->mc_checking |= CC_SKIPORD; if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + mc->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags); diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index 6eec6fc3..0f57b599 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -186,10 +186,10 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { error("mdbx_cursor_open", rc); return rc; } - if (MDBX_DEBUG > 0 && rescue) { - cursor->mc_flags |= C_SKIPORD; + if (rescue) { + cursor->mc_checking |= CC_SKIPORD; if (cursor->mc_xcursor) - cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == @@ -383,10 +383,10 @@ int main(int argc, char *argv[]) { error("mdbx_cursor_open", rc); goto txn_abort; } - if (MDBX_DEBUG > 0 && rescue) { - cursor->mc_flags |= C_SKIPORD; + if (rescue) { + cursor->mc_checking |= CC_SKIPORD; if (cursor->mc_xcursor) - 
cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } bool have_raw = false; diff --git a/src/options.h b/src/options.h index 30ffdf1b..f2b5b900 100644 --- a/src/options.h +++ b/src/options.h @@ -89,11 +89,11 @@ /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 From 2d300d807b5ca7c93af8c3d47d823558ad19298d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 3 Jul 2022 22:37:47 +0300 Subject: [PATCH 015/364] mdbx: extract `node_read_bigdata()`. 
--- src/core.c | 96 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 37 deletions(-) diff --git a/src/core.c b/src/core.c index 713d4bab..ea1e5cf6 100644 --- a/src/core.c +++ b/src/core.c @@ -3926,7 +3926,7 @@ static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, static int __must_check_result mdbx_node_read(MDBX_cursor *mc, const MDBX_node *leaf, MDBX_val *data, - const txnid_t front); + const MDBX_page *mp); static int __must_check_result mdbx_rebalance(MDBX_cursor *mc); static int __must_check_result mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key); @@ -6695,7 +6695,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (unlikely((ret.err = mdbx_node_read( &recur.outer, page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), - &data, mp->mp_txnid)) != MDBX_SUCCESS)) + &data, mp)) != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { @@ -14114,8 +14114,8 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { return MDBX_INCOMPATIBLE; /* not a named DB */ } - const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; - rc = mdbx_node_read(&couple.outer, nsr.node, &data, pp_txnid); + rc = mdbx_node_read(&couple.outer, nsr.node, &data, + couple.outer.mc_pg[couple.outer.mc_top]); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14142,6 +14142,7 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { memcpy(db, data.iov_base, sizeof(MDBX_db)); #if !MDBX_DISABLE_VALIDATION + const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; mdbx_tassert(txn, txn->mt_front >= pp_txnid); if (unlikely(db->md_mod_txnid > pp_txnid)) { mdbx_error("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", @@ -14256,32 +14257,55 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, return mdbx_page_search_root(mc, key, flags); } -/* Return the data associated with a given node. 
- * - * [in] mc The cursor for this operation. - * [in] leaf The node being read. - * [out] data Updated to point to the node's data. - * - * Returns 0 on success, non-zero on failure. */ -static __always_inline int mdbx_node_read(MDBX_cursor *mc, - const MDBX_node *node, MDBX_val *data, - const txnid_t front) { - data->iov_len = node_ds(node); - data->iov_base = node_data(node); - if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { - /* Read overflow data. */ - MDBX_page *omp; /* overflow page */ - int rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, front); - if (unlikely((rc != MDBX_SUCCESS))) { - mdbx_debug("read overflow page %" PRIaPGNO " failed", - node_largedata_pgno(node)); - return rc; - } - data->iov_base = page_data(omp); +/* Read overflow node data. */ +static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, + MDBX_val *data, const MDBX_page *mp) { + mdbx_cassert(mc, + node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); + + struct page_result ret = + mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((ret.err != MDBX_SUCCESS))) { + mdbx_debug("read overflow page %" PRIaPGNO " failed", + node_largedata_pgno(node)); + return ret.err; + } + + data->iov_base = page_data(ret.page); + if (!MDBX_DISABLE_VALIDATION && + unlikely(PAGETYPE_EXTRA(ret.page) != P_OVERFLOW)) + return bad_page(ret.page, "invalid page-type 0x%x for bigdata-node", + PAGETYPE_EXTRA(ret.page)); + if (!MDBX_DISABLE_VALIDATION && + unlikely(node_size_len(node_ks(node), data->iov_len) <= + mc->mc_txn->mt_env->me_leaf_nodemax)) + bad_page(mp, "too small data (%zu bytes) for bigdata-node", data->iov_len); + if (!MDBX_DISABLE_VALIDATION && + unlikely(ret.page->mp_pages != + number_of_ovpages(mc->mc_txn->mt_env, data->iov_len))) { + if (ret.page->mp_pages < + number_of_ovpages(mc->mc_txn->mt_env, data->iov_len)) + return bad_page(ret.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + ret.page->mp_pages, 
data->iov_len); + else + bad_page(ret.page, "extra n-pages %u for bigdata-node (%zu bytes)", + ret.page->mp_pages, data->iov_len); } return MDBX_SUCCESS; } +/* Return the data associated with a given node. */ +static __always_inline int mdbx_node_read(MDBX_cursor *mc, + const MDBX_node *node, MDBX_val *data, + const MDBX_page *mp) { + data->iov_len = node_ds(node); + data->iov_base = node_data(node); + if (likely(node_flags(node) != F_BIGDATA)) + return MDBX_SUCCESS; + return node_read_bigdata(mc, node, data, mp); +} + int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { DKBUF_DEBUG; mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); @@ -14523,7 +14547,7 @@ skip: if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mp->mp_txnid); + rc = mdbx_node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14616,7 +14640,7 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mp->mp_txnid); + rc = mdbx_node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14880,8 +14904,7 @@ got_node: } } MDBX_val actual_data; - ret.err = mdbx_node_read(mc, node, &actual_data, - mc->mc_pg[mc->mc_top]->mp_txnid); + ret.err = mdbx_node_read(mc, node, &actual_data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data); @@ -14896,7 +14919,7 @@ got_node: } *data = actual_data; } else { - ret.err = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); + ret.err = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } @@ -14953,7 +14976,7 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (unlikely(rc)) return rc; } else if (likely(data)) 
{ - rc = mdbx_node_read(mc, node, data, mp->mp_txnid); + rc = mdbx_node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -15002,7 +15025,7 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (unlikely(rc)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mp->mp_txnid); + rc = mdbx_node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -15069,7 +15092,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } } else { - rc = mdbx_node_read(mc, node, data, mp->mp_txnid); + rc = mdbx_node_read(mc, node, data, mp); if (unlikely(rc)) return rc; } @@ -15176,7 +15199,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(node_flags(node), F_DUPDATA)) { get_key_optional(node, key); - rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]->mp_txnid); + rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]); break; } } @@ -15350,7 +15373,6 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return MDBX_NOTFOUND; } - const txnid_t pp_txnid = mp->mp_txnid; do { if (unlikely(n + 2 > limit)) { rc = MDBX_RESULT_TRUE; @@ -15358,7 +15380,7 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, } const MDBX_node *leaf = page_node(mp, i); get_key(leaf, &pairs[n]); - rc = mdbx_node_read(mc, leaf, &pairs[n + 1], pp_txnid); + rc = mdbx_node_read(mc, leaf, &pairs[n + 1], mp); if (unlikely(rc != MDBX_SUCCESS)) break; n += 2; From 1740f8227a7a718d1c4e8f4230c1c80c0021657d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 4 Jul 2022 13:22:18 +0300 Subject: [PATCH 016/364] mdbx: rework copy-with-compactification. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Кроме небольшого рефакторинга здесь реализуется более регулярный способ обхода дерева при копировании с компактификацией. В частности, полная инициализация курсоров позволяет выполнять больше проверок/контроля структуры БД и избавиться от флажка CC_COPYING. Besides a small refactoring, a more regular way of traversing the tree when copying with compactification is implemented here. In particular, full initialization of cursors makes it possible to perform more checks/control of the DB structure and to get rid of the CC_COPYING flag. --- src/core.c | 412 ++++++++++++++++++++++++++++------------------------- 1 file changed, 215 insertions(+), 197 deletions(-) diff --git a/src/core.c b/src/core.c index ea1e5cf6..82a1327f 100644 --- a/src/core.c +++ b/src/core.c @@ -19373,7 +19373,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /**** COPYING *****************************************************************/ /* State needed for a double-buffering compacting copy. */ -typedef struct mdbx_copy { +typedef struct mdbx_compacting_ctx { MDBX_env *mc_env; MDBX_txn *mc_txn; mdbx_condpair_t mc_condpair; @@ -19388,39 +19388,39 @@ typedef struct mdbx_copy { pgno_t mc_next_pgno; volatile unsigned mc_head; volatile unsigned mc_tail; -} mdbx_copy; +} mdbx_compacting_ctx; /* Dedicated writer thread for compacting copy. 
*/ -__cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) { - mdbx_copy *my = arg; +__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { + mdbx_compacting_ctx *const ctx = arg; #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) sigset_t sigset; sigemptyset(&sigset); sigaddset(&sigset, SIGPIPE); - my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); + ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); #endif /* EPIPE */ - mdbx_condpair_lock(&my->mc_condpair); - while (!my->mc_error) { - while (my->mc_tail == my->mc_head && !my->mc_error) { - int err = mdbx_condpair_wait(&my->mc_condpair, true); + mdbx_condpair_lock(&ctx->mc_condpair); + while (!ctx->mc_error) { + while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) { + int err = mdbx_condpair_wait(&ctx->mc_condpair, true); if (err != MDBX_SUCCESS) { - my->mc_error = err; + ctx->mc_error = err; goto bailout; } } - const unsigned toggle = my->mc_tail & 1; - size_t wsize = my->mc_wlen[toggle]; + const unsigned toggle = ctx->mc_tail & 1; + size_t wsize = ctx->mc_wlen[toggle]; if (wsize == 0) { - my->mc_tail += 1; + ctx->mc_tail += 1; break /* EOF */; } - my->mc_wlen[toggle] = 0; - uint8_t *ptr = my->mc_wbuf[toggle]; + ctx->mc_wlen[toggle] = 0; + uint8_t *ptr = ctx->mc_wbuf[toggle]; again: - if (!my->mc_error) { - int err = mdbx_write(my->mc_fd, ptr, wsize); + if (!ctx->mc_error) { + int err = mdbx_write(ctx->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) if (err == EPIPE) { @@ -19430,128 +19430,118 @@ __cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) { sigwait(&sigset, &unused); } #endif /* EPIPE */ - my->mc_error = err; + ctx->mc_error = err; goto bailout; } } /* If there's an overflow page tail, write it too */ - wsize = my->mc_olen[toggle]; + wsize = ctx->mc_olen[toggle]; if (wsize) { - my->mc_olen[toggle] = 0; - ptr = my->mc_over[toggle]; + ctx->mc_olen[toggle] = 0; + ptr = 
ctx->mc_over[toggle]; goto again; } - my->mc_tail += 1; - mdbx_condpair_signal(&my->mc_condpair, false); + ctx->mc_tail += 1; + mdbx_condpair_signal(&ctx->mc_condpair, false); } bailout: - mdbx_condpair_unlock(&my->mc_condpair); + mdbx_condpair_unlock(&ctx->mc_condpair); return (THREAD_RESULT)0; } /* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */ -__cold static int mdbx_env_cthr_toggle(mdbx_copy *my) { - mdbx_condpair_lock(&my->mc_condpair); - mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error); - my->mc_head += 1; - mdbx_condpair_signal(&my->mc_condpair, true); - while (!my->mc_error && - my->mc_head - my->mc_tail == 2 /* both buffers in use */) { - int err = mdbx_condpair_wait(&my->mc_condpair, false); +__cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) { + mdbx_condpair_lock(&ctx->mc_condpair); + mdbx_assert(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error); + ctx->mc_head += 1; + mdbx_condpair_signal(&ctx->mc_condpair, true); + while (!ctx->mc_error && + ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) { + int err = mdbx_condpair_wait(&ctx->mc_condpair, false); if (err != MDBX_SUCCESS) - my->mc_error = err; + ctx->mc_error = err; } - mdbx_condpair_unlock(&my->mc_condpair); - return my->mc_error; + mdbx_condpair_unlock(&ctx->mc_condpair); + return ctx->mc_error; } -/* Depth-first tree traversal for compacting copy. */ -__cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, - const unsigned hive_flags) { - MDBX_cursor_couple couple; - MDBX_page *copy; +__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb); - /* Empty DB, nothing to do */ - if (*pg == P_INVALID) - return MDBX_SUCCESS; - - memset(&couple, 0, sizeof(couple)); - couple.outer.mc_snum = 1; - couple.outer.mc_txn = my->mc_txn; - couple.outer.mc_checking = couple.inner.mx_cursor.mc_checking = - (hive_flags & MDBX_DUPFIXED) - ? 
CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF | CC_LEAF2 - : CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF; - - int rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0], - my->mc_txn->mt_txnid); +__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, + MDBX_cursor *mc, pgno_t *root, + txnid_t parent_txnid) { + mc->mc_snum = 1; + int rc = mdbx_page_get(mc, *root, &mc->mc_pg[0], parent_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST); + + rc = mdbx_page_search_root(mc, nullptr, MDBX_PS_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Make cursor pages writable */ - char *const buf = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum)); + char *const buf = mdbx_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); if (buf == NULL) return MDBX_ENOMEM; char *ptr = buf; - for (unsigned i = 0; i < couple.outer.mc_top; i++) { - mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i], - my->mc_env->me_psize); - couple.outer.mc_pg[i] = (MDBX_page *)ptr; - ptr += my->mc_env->me_psize; + for (unsigned i = 0; i < mc->mc_top; i++) { + mdbx_page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); + mc->mc_pg[i] = (MDBX_page *)ptr; + ptr += ctx->mc_env->me_psize; } /* This is writable space for a leaf page. Usually not needed. 
*/ MDBX_page *const leaf = (MDBX_page *)ptr; + MDBX_page *copy; - while (couple.outer.mc_snum > 0) { - MDBX_page *mp = couple.outer.mc_pg[couple.outer.mc_top]; + while (mc->mc_snum > 0) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = page_numkeys(mp); if (IS_LEAF(mp)) { - if (hive_flags == 0 /* may have nested F_SUBDATA or F_BIGDATA nodes */) { + if (!(mc->mc_flags & + C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) { for (unsigned i = 0; i < n; i++) { MDBX_node *node = page_node(mp, i); - if (node_flags(node) & F_BIGDATA) { - MDBX_page *osrc; - + if (node_flags(node) == F_BIGDATA) { /* Need writable leaf */ if (mp != leaf) { - couple.outer.mc_pg[couple.outer.mc_top] = leaf; - mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mc->mc_pg[mc->mc_top] = leaf; + mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } const pgno_t pgno = node_largedata_pgno(node); - poke_pgno(node_data(node), my->mc_next_pgno); - rc = mdbx_page_get(&couple.outer, pgno, &osrc, mp->mp_txnid); + poke_pgno(node_data(node), ctx->mc_next_pgno); + MDBX_page *osrc; + rc = mdbx_page_get(mc, pgno, &osrc, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) goto done; - unsigned toggle = my->mc_head & 1; - if (my->mc_wlen[toggle] + my->mc_env->me_psize > - ((size_t)(MDBX_ENVCOPY_WRITEBUF))) { - rc = mdbx_env_cthr_toggle(my); + + unsigned side = ctx->mc_head & 1; + if (ctx->mc_wlen[side] + ctx->mc_env->me_psize > + (size_t)MDBX_ENVCOPY_WRITEBUF) { + rc = compacting_toggle_write_buffers(ctx); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_head & 1; + side = ctx->mc_head & 1; } - copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - memcpy(copy, osrc, my->mc_env->me_psize); - copy->mp_pgno = my->mc_next_pgno; - my->mc_next_pgno += osrc->mp_pages; - my->mc_wlen[toggle] += my->mc_env->me_psize; + copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]); + memcpy(copy, osrc, ctx->mc_env->me_psize); + copy->mp_pgno = 
ctx->mc_next_pgno; + ctx->mc_next_pgno += osrc->mp_pages; + ctx->mc_wlen[side] += ctx->mc_env->me_psize; + if (osrc->mp_pages > 1) { - my->mc_olen[toggle] = pgno2bytes(my->mc_env, osrc->mp_pages - 1); - my->mc_over[toggle] = (uint8_t *)osrc + my->mc_env->me_psize; - rc = mdbx_env_cthr_toggle(my); + ctx->mc_olen[side] = pgno2bytes(ctx->mc_env, osrc->mp_pages - 1); + ctx->mc_over[side] = (uint8_t *)osrc + ctx->mc_env->me_psize; + rc = compacting_toggle_write_buffers(ctx); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_head & 1; + side = ctx->mc_head & 1; } } else if (node_flags(node) & F_SUBDATA) { if (!MDBX_DISABLE_VALIDATION && @@ -19562,77 +19552,84 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, /* Need writable leaf */ if (mp != leaf) { - couple.outer.mc_pg[couple.outer.mc_top] = leaf; - mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mc->mc_pg[mc->mc_top] = leaf; + mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } - MDBX_db db; - memcpy(&db, node_data(node), sizeof(MDBX_db)); - STATIC_ASSERT(F_DUPDATA == MDBX_DUPSORT); - rc = mdbx_env_cwalk(my, &db.md_root, - (node_flags(node) & F_DUPDATA) - ? 
MDBX_DUPSORT | - (db.md_flags & MDBX_DUPFIXED) - : 0); + MDBX_db *nested = nullptr; + if (node_flags(node) & F_DUPDATA) { + rc = mdbx_xcursor_init1(mc, node, mp); + if (likely(rc == MDBX_SUCCESS)) { + nested = &mc->mc_xcursor->mx_db; + rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor, + &nested->md_root, mp->mp_txnid); + } + } else { + MDBX_cursor_couple *couple = + container_of(mc, MDBX_cursor_couple, inner.mx_cursor); + nested = &couple->inner.mx_db; + memcpy(nested, node_data(node), sizeof(MDBX_db)); + rc = compacting_walk_sdb(ctx, nested); + } if (unlikely(rc != MDBX_SUCCESS)) goto done; - memcpy(node_data(node), &db, sizeof(MDBX_db)); + memcpy(node_data(node), nested, sizeof(MDBX_db)); } } } } else { - couple.outer.mc_ki[couple.outer.mc_top]++; - if (couple.outer.mc_ki[couple.outer.mc_top] < n) { + mc->mc_ki[mc->mc_top]++; + if (mc->mc_ki[mc->mc_top] < n) { again:; - const MDBX_node *node = - page_node(mp, couple.outer.mc_ki[couple.outer.mc_top]); - if (unlikely(node->mn_flags)) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (unlikely(node_flags(node))) { mdbx_error("unexpected type 0x%x of node #%u on page #%" PRIaPGNO, - node->mn_flags, couple.outer.mc_ki[couple.outer.mc_top], - couple.outer.mc_pg[couple.outer.mc_top]->mp_pgno); + node_flags(node), mc->mc_ki[mc->mc_top], + mc->mc_pg[mc->mc_top]->mp_pgno); rc = MDBX_CORRUPTED; goto done; } - rc = mdbx_page_get(&couple.outer, node_pgno(node), &mp, mp->mp_txnid); + rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) goto done; - couple.outer.mc_top++; - couple.outer.mc_snum++; - couple.outer.mc_ki[couple.outer.mc_top] = 0; + mc->mc_top++; + mc->mc_snum++; + mc->mc_ki[mc->mc_top] = 0; if (IS_BRANCH(mp)) { /* Whenever we advance to a sibling branch page, * we must proceed all the way down to its first leaf. 
*/ - mdbx_page_copy(couple.outer.mc_pg[couple.outer.mc_top], mp, - my->mc_env->me_psize); + mdbx_page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); goto again; } else - couple.outer.mc_pg[couple.outer.mc_top] = mp; + mc->mc_pg[mc->mc_top] = mp; continue; } } - unsigned toggle = my->mc_head & 1; - if (my->mc_wlen[toggle] + my->mc_wlen[toggle] > - ((size_t)(MDBX_ENVCOPY_WRITEBUF))) { - rc = mdbx_env_cthr_toggle(my); + + unsigned side = ctx->mc_head & 1; + if (ctx->mc_wlen[side] + ctx->mc_env->me_psize > + (size_t)MDBX_ENVCOPY_WRITEBUF) { + rc = compacting_toggle_write_buffers(ctx); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_head & 1; + side = ctx->mc_head & 1; } - copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - mdbx_page_copy(copy, mp, my->mc_env->me_psize); - copy->mp_pgno = my->mc_next_pgno++; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (couple.outer.mc_top) { + copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]); + mdbx_page_copy(copy, mp, ctx->mc_env->me_psize); + copy->mp_pgno = ctx->mc_next_pgno++; + ctx->mc_wlen[side] += ctx->mc_env->me_psize; + + if (mc->mc_top) { /* Update parent if there is one */ - node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1], - couple.outer.mc_ki[couple.outer.mc_top - 1]), - copy->mp_pgno); - mdbx_cursor_pop(&couple.outer); + node_set_pgno( + page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]), + copy->mp_pgno); + mdbx_cursor_pop(mc); } else { /* Otherwise we're done */ - *pg = copy->mp_pgno; + *root = copy->mp_pgno; break; } } @@ -19641,7 +19638,25 @@ done: return rc; } -__cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { +__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { + if (unlikely(sdb->md_root == P_INVALID)) + return MDBX_SUCCESS; /* empty db */ + + MDBX_cursor_couple couple; + MDBX_dbx dbx = {.md_klen_min = INT_MAX}; + uint8_t dbistate = DBI_VALID | DBI_AUDITED; + int rc = 
mdbx_couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root, + sdb->md_mod_txnid ? sdb->md_mod_txnid + : ctx->mc_txn->mt_txnid); +} + +__cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { /* Calculate filesize taking in account shrink/growing thresholds */ if (meta->mm_geo.next != meta->mm_geo.now) { meta->mm_geo.now = meta->mm_geo.next; @@ -19665,7 +19680,7 @@ __cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { } /* Make resizeable */ -__cold static void make_sizeable(MDBX_meta *meta) { +__cold static void meta_make_sizeable(MDBX_meta *meta) { meta->mm_geo.lower = MIN_PAGENO; if (meta->mm_geo.grow_pv == 0) { const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42; @@ -19688,7 +19703,7 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, meta_set_txnid(env, meta, read_txn->mt_txnid); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) - make_sizeable(meta); + meta_make_sizeable(meta); /* copy canary sequences if present */ if (read_txn->mt_canary.v) { @@ -19696,67 +19711,96 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, meta->mm_canary.v = constmeta_txnid(env, meta); } - /* Set metapage 1 with current main DB */ - pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root; - if ((new_root = root) == P_INVALID) { + if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) { /* When the DB is empty, handle it specially to * fix any breakage like page leaks from ITS#8174. 
*/ meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; - compact_fixup_meta(env, meta); + compacting_fixup_meta(env, meta); if (dest_is_pipe) { int rc = mdbx_write(fd, buffer, meta_bytes); - if (rc != MDBX_SUCCESS) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } } else { - /* Count free pages + GC pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. */ - pgno_t freecount = 0; + /* Count free pages + GC pages. */ MDBX_cursor_couple couple; - MDBX_val key, data; - int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 0) - freecount += *(pgno_t *)data.iov_base; + pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages + + read_txn->mt_dbs[FREE_DBI].md_leaf_pages + + read_txn->mt_dbs[FREE_DBI].md_overflow_pages; + MDBX_val key, data; + while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == + MDBX_SUCCESS) { + const MDBX_PNL pnl = data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(pnl) || + !(mdbx_pnl_check(pnl, read_txn->mt_next_pgno)))) + return MDBX_CORRUPTED; + gc += MDBX_PNL_SIZE(pnl); + } if (unlikely(rc != MDBX_NOTFOUND)) return rc; - freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages + - read_txn->mt_dbs[FREE_DBI].md_leaf_pages + - read_txn->mt_dbs[FREE_DBI].md_overflow_pages; - - new_root = read_txn->mt_next_pgno - 1 - freecount; - meta->mm_geo.next = new_root + 1; + /* Substract GC-pages from mt_next_pgno to find the new mt_next_pgno. 
*/ + meta->mm_geo.next = read_txn->mt_next_pgno - gc; + /* Set with current main DB */ meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; - meta->mm_dbs[MAIN_DBI].md_root = new_root; - mdbx_copy ctx; + mdbx_compacting_ctx ctx; memset(&ctx, 0, sizeof(ctx)); rc = mdbx_condpair_init(&ctx.mc_condpair); if (unlikely(rc != MDBX_SUCCESS)) return rc; - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2); + memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF); ctx.mc_wbuf[0] = data_buffer; - ctx.mc_wbuf[1] = data_buffer + ((size_t)(MDBX_ENVCOPY_WRITEBUF)); + ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF; ctx.mc_next_pgno = NUM_METAS; ctx.mc_env = env; ctx.mc_fd = fd; ctx.mc_txn = read_txn; mdbx_thread_t thread; - int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx); + int thread_err = mdbx_thread_create(&thread, compacting_write_thread, &ctx); if (likely(thread_err == MDBX_SUCCESS)) { if (dest_is_pipe) { - compact_fixup_meta(env, meta); + compacting_fixup_meta(env, meta); rc = mdbx_write(fd, buffer, meta_bytes); } - if (rc == MDBX_SUCCESS) - rc = mdbx_env_cwalk(&ctx, &root, 0); - mdbx_env_cthr_toggle(&ctx); - mdbx_env_cthr_toggle(&ctx); + if (likely(rc == MDBX_SUCCESS)) + rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]); + if (ctx.mc_wlen[ctx.mc_head & 1]) + /* toggle to flush non-empty buffers */ + compacting_toggle_write_buffers(&ctx); + + if (likely(rc == MDBX_SUCCESS) && + unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) { + if (ctx.mc_next_pgno > meta->mm_geo.next) { + mdbx_error( + "the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has double-used pages or other corruption", ctx.mc_next_pgno, + '>', meta->mm_geo.next); + rc = MDBX_CORRUPTED; /* corrupted DB */ + } + if (ctx.mc_next_pgno < meta->mm_geo.next) { + mdbx_warning( + "the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has page leak(s)", ctx.mc_next_pgno, '<', 
meta->mm_geo.next); + if (dest_is_pipe) + /* the root within already written meta-pages is wrong */ + rc = MDBX_CORRUPTED; + } + /* fixup meta */ + meta->mm_geo.next = ctx.mc_next_pgno; + } + + /* toggle with empty buffers to exit thread's loop */ + mdbx_assert(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0); + compacting_toggle_write_buffers(&ctx); thread_err = mdbx_thread_join(thread); mdbx_assert(env, (ctx.mc_tail == ctx.mc_head && ctx.mc_wlen[ctx.mc_head & 1] == 0) || @@ -19769,32 +19813,8 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, return rc; if (unlikely(ctx.mc_error != MDBX_SUCCESS)) return ctx.mc_error; - - if (dest_is_pipe) { - if (unlikely(root != new_root)) { - mdbx_error("post-compactification root %" PRIaPGNO - " NE expected %" PRIaPGNO - " (source DB corrupted or has a page leak(s))", - root, new_root); - return MDBX_CORRUPTED; /* page leak or corrupt DB */ - } - } else { - if (unlikely(root > new_root)) { - mdbx_error("post-compactification root %" PRIaPGNO - " GT expected %" PRIaPGNO " (source DB corrupted)", - root, new_root); - return MDBX_CORRUPTED; /* page leak or corrupt DB */ - } - if (unlikely(root < new_root)) { - mdbx_warning("post-compactification root %" PRIaPGNO - " LT expected %" PRIaPGNO " (page leak(s) in source DB)", - root, new_root); - /* fixup meta */ - meta->mm_dbs[MAIN_DBI].md_root = root; - meta->mm_geo.next = root + 1; - } - compact_fixup_meta(env, meta); - } + if (!dest_is_pipe) + compacting_fixup_meta(env, meta); } /* Extend file if required */ @@ -19804,12 +19824,11 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, return mdbx_ftruncate(fd, whole_size); const size_t used_size = pgno2bytes(env, meta->mm_geo.next); - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF))); + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; offset < whole_size;) { - const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset) - ? 
((size_t)(MDBX_ENVCOPY_WRITEBUF)) - : whole_size - offset; + const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : whole_size - offset; /* copy to avoid EFAULT in case swapped-out */ int rc = mdbx_write(fd, data_buffer, chunk); if (unlikely(rc != MDBX_SUCCESS)) @@ -19850,7 +19869,7 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, mdbx_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) - make_sizeable(headcopy); + meta_make_sizeable(headcopy); /* Update signature to steady */ unaligned_poke_u64(4, headcopy->mm_datasync_sign, meta_sign(headcopy)); @@ -19910,10 +19929,9 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, #endif /* MDBX_USE_COPYFILERANGE */ /* fallback to portable */ - const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < used_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) - : used_size - offset; + const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : used_size - offset; /* copy to avoid EFAULT in case swapped-out */ memcpy(data_buffer, env->me_map + offset, chunk); rc = mdbx_write(fd, data_buffer, chunk); @@ -19925,12 +19943,12 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, if (!dest_is_pipe) rc = mdbx_ftruncate(fd, whole_size); else { - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF))); + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) { const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) + ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? 
(size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; /* copy to avoid EFAULT in case swapped-out */ rc = mdbx_write(fd, data_buffer, chunk); @@ -19961,8 +19979,8 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, const size_t buffer_size = pgno_align2os_bytes(env, NUM_METAS) + ceil_powerof2(((flags & MDBX_CP_COMPACT) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2 - : ((size_t)(MDBX_ENVCOPY_WRITEBUF))), + ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF + : (size_t)MDBX_ENVCOPY_WRITEBUF), env->me_os_psize); uint8_t *buffer = NULL; From 498514dae18547ace207be26660bfc34b3154ad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 4 Jul 2022 20:53:20 +0300 Subject: [PATCH 017/364] mdbx: remove unneeded `CC_COPYING`. --- src/core.c | 63 ++++++++++++++++++++----------------------------- src/internals.h | 1 - 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/src/core.c b/src/core.c index 82a1327f..e61c0ebb 100644 --- a/src/core.c +++ b/src/core.c @@ -18168,12 +18168,10 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, const size_t ksize_max = keysize_max(env->me_psize, 0); const size_t leaf2_ksize = mp->mp_leaf2_ksize; if (IS_LEAF2(mp)) { - if ((mc->mc_checking & CC_COPYING) == 0) { - if (unlikely((mc->mc_flags & C_SUB) == 0 || - (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) - rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", - mc->mc_db->md_flags); - } + if (unlikely((mc->mc_flags & C_SUB) == 0 || + (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) + rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", + mc->mc_db->md_flags); if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max)) rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize); } @@ -18188,8 +18186,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } - if ((mc->mc_checking & CC_COPYING) == 0 && - unlikely(leaf2_ksize != 
mc->mc_dbx->md_klen_min)) { + if (unlikely(leaf2_ksize != mc->mc_dbx->md_klen_min)) { if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || leaf2_ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( @@ -18224,8 +18221,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } if ((IS_LEAF(mp) || i > 0)) { - if ((mc->mc_checking & CC_COPYING) == 0 && - unlikely(ksize < mc->mc_dbx->md_klen_min || + if (unlikely(ksize < mc->mc_dbx->md_klen_min || ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n", @@ -18276,14 +18272,12 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); continue; } - if ((mc->mc_checking & CC_COPYING) == 0) { - if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) - rc = bad_page( - mp, - "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - } + if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) + rc = bad_page( + mp, + "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); if ((mc->mc_checking & CC_RETIRING) == 0) { /* Disable full checking to avoid infinite recursion * with a corrupted DB */ @@ -18323,14 +18317,12 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, /* wrong, but already handled */ continue; case 0 /* usual */: - if ((mc->mc_checking & CC_COPYING) == 0) { - if (unlikely(dsize < mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) { - rc = bad_page( - mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - continue; - } + if (unlikely(dsize < mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) { + rc = bad_page( + mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, 
mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + continue; } break; case F_SUBDATA /* sub-db */: @@ -18376,8 +18368,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } - if ((mc->mc_checking & CC_COPYING) == 0 && - unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { + if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || sub_ksize > mc->mc_dbx->md_vlen_max)) rc = bad_page(mp, @@ -18415,15 +18406,13 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, size_t sub_dsize = node_ds(sub_node); /* char *sub_data = node_data(sub_node); */ - if ((mc->mc_checking & CC_COPYING) == 0) { - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) - rc = bad_page(mp, - "nested-node-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - } + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) + rc = bad_page(mp, + "nested-node-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); if ((mc->mc_checking & CC_SKIPORD) == 0) { sub_here.iov_len = sub_ksize; sub_here.iov_base = sub_key; diff --git a/src/internals.h b/src/internals.h index 5001d30c..4162610d 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1051,7 +1051,6 @@ struct MDBX_cursor { #define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ #define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ #define CC_UPDATING 0x04 /* update/rebalance pending */ -#define CC_COPYING 0x08 /* skip key-value length check (copying simplify) */ #define CC_SKIPORD 0x10 /* don't check keys ordering */ #define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ #define CC_RETIRING 0x40 /* refs to child pages may be invalid */ From ca3f188370ff978569d1411488e78791cf475492 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 4 Jul 2022 21:22:39 +0300 Subject: [PATCH 018/364] mdbx: extend `CHECK_LEAF_TYPE()` by adding `CC_OVERFLOW`. --- src/core.c | 2 +- src/internals.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index e61c0ebb..a94f9e24 100644 --- a/src/core.c +++ b/src/core.c @@ -16899,7 +16899,7 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, couple->outer.mc_pg[0] = 0; couple->outer.mc_flags = 0; STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF && - CC_LEAF2 == P_LEAF2); + CC_OVERFLOW == P_OVERFLOW && CC_LEAF2 == P_LEAF2); couple->outer.mc_checking = (mdbx_audit_enabled() || (txn->mt_env->me_flags & MDBX_VALIDATION)) ? CC_PAGECHECK | CC_LEAF diff --git a/src/internals.h b/src/internals.h index 4162610d..325a0524 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1050,7 +1050,8 @@ struct MDBX_cursor { /* Cursor checking flags. */ #define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ #define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ -#define CC_UPDATING 0x04 /* update/rebalance pending */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ #define CC_SKIPORD 0x10 /* don't check keys ordering */ #define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ #define CC_RETIRING 0x40 /* refs to child pages may be invalid */ @@ -1063,7 +1064,7 @@ struct MDBX_cursor { #define CHECK_LEAF_TYPE(mc, mp) \ (((PAGETYPE_EXTRA(mp) ^ (mc)->mc_checking) & \ - (CC_BRANCH | CC_LEAF | CC_LEAF2)) == 0) + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) /* Context for sorted-dup records. 
* We could have gone to a fully recursive design, with arbitrarily From 6eefa05f3d7bc38c94788105742c9131043e966b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 4 Jul 2022 21:57:42 +0300 Subject: [PATCH 019/364] mdbx: minor refine `CC_RETIRING` handling. --- src/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/core.c b/src/core.c index a94f9e24..56f475af 100644 --- a/src/core.c +++ b/src/core.c @@ -18240,11 +18240,12 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, unlikely(ksize != 0)) rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n", i, ksize); - if ((mc->mc_checking & CC_RETIRING) == 0) { - const pgno_t ref = node_pgno(node); - if (unlikely(ref < MIN_PAGENO || ref >= mc->mc_txn->mt_next_pgno)) - rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); - } + const pgno_t ref = node_pgno(node); + if (unlikely(ref < MIN_PAGENO) || + (unlikely(ref >= mc->mc_txn->mt_next_pgno) && + (unlikely(ref >= mc->mc_txn->mt_geo.now) || + !(mc->mc_checking & CC_RETIRING)))) + rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); if (unlikely(node_flags(node))) rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i, node_flags(node)); From b31b270ffd18f20995a196fdae8c96e74950ae32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 5 Jul 2022 19:02:47 +0300 Subject: [PATCH 020/364] mdbx: refine copy-with-compactification to clear/zero unused gaps on a DB pages. 
--- src/core.c | 167 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 96 insertions(+), 71 deletions(-) diff --git a/src/core.c b/src/core.c index 56f475af..90bdc62e 100644 --- a/src/core.c +++ b/src/core.c @@ -19368,9 +19368,7 @@ typedef struct mdbx_compacting_ctx { MDBX_txn *mc_txn; mdbx_condpair_t mc_condpair; uint8_t *mc_wbuf[2]; - uint8_t *mc_over[2]; size_t mc_wlen[2]; - size_t mc_olen[2]; mdbx_filehandle_t mc_fd; /* Error code. Never cleared if set. Both threads can set nonzero * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ @@ -19408,7 +19406,6 @@ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { } ctx->mc_wlen[toggle] = 0; uint8_t *ptr = ctx->mc_wbuf[toggle]; - again: if (!ctx->mc_error) { int err = mdbx_write(ctx->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { @@ -19424,14 +19421,6 @@ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { goto bailout; } } - - /* If there's an overflow page tail, write it too */ - wsize = ctx->mc_olen[toggle]; - if (wsize) { - ctx->mc_olen[toggle] = 0; - ptr = ctx->mc_over[toggle]; - goto again; - } ctx->mc_tail += 1; mdbx_condpair_signal(&ctx->mc_condpair, false); } @@ -19458,6 +19447,69 @@ __cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) { __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb); +static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, + size_t bytes, pgno_t pgno, pgno_t npages) { + assert(pgno == 0 || bytes > PAGEHDRSZ); + while (bytes > 0) { + const unsigned side = ctx->mc_head & 1; + const size_t left = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; + if (left < (pgno ? PAGEHDRSZ : 1)) { + int err = compacting_toggle_write_buffers(ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + continue; + } + const size_t chunk = (bytes < left) ? 
bytes : left; + void *const dst = ctx->mc_wbuf[side] + ctx->mc_wlen[side]; + if (src) { + memcpy(dst, src, chunk); + if (pgno) { + assert(chunk > PAGEHDRSZ); + MDBX_page *mp = dst; + mp->mp_pgno = pgno; + if (mp->mp_flags == P_OVERFLOW) { + assert(bytes <= pgno2bytes(ctx->mc_env, npages)); + mp->mp_pages = npages; + } + pgno = 0; + } + src = (const char *)src + chunk; + } else + memset(dst, 0, chunk); + bytes -= chunk; + ctx->mc_wlen[side] += chunk; + } + return MDBX_SUCCESS; +} + +static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, + const size_t head_bytes, const size_t tail_bytes, + const pgno_t npages) { + if (tail_bytes) { + assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize); + assert(npages == 1 && + (PAGETYPE_EXTRA(mp) == P_BRANCH || PAGETYPE_EXTRA(mp) == P_LEAF)); + } else { + assert(head_bytes <= pgno2bytes(ctx->mc_env, npages)); + assert((npages == 1 && PAGETYPE_EXTRA(mp) == (P_LEAF | P_LEAF2)) || + PAGETYPE_EXTRA(mp) == P_OVERFLOW); + } + + const pgno_t pgno = ctx->mc_next_pgno; + ctx->mc_next_pgno += npages; + int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = compacting_put_bytes( + ctx, nullptr, pgno2bytes(ctx->mc_env, npages) - (head_bytes + tail_bytes), + 0, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + return compacting_put_bytes( + ctx, (const char *)mp + ctx->mc_env->me_psize - tail_bytes, tail_bytes, 0, + 0); +} + __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, MDBX_cursor *mc, pgno_t *root, txnid_t parent_txnid) { @@ -19481,10 +19533,8 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, mc->mc_pg[i] = (MDBX_page *)ptr; ptr += ctx->mc_env->me_psize; } - /* This is writable space for a leaf page. Usually not needed. 
*/ MDBX_page *const leaf = (MDBX_page *)ptr; - MDBX_page *copy; while (mc->mc_snum > 0) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; @@ -19504,35 +19554,17 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, node = page_node(mp, i); } - const pgno_t pgno = node_largedata_pgno(node); + const struct page_result lp = + mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((rc = lp.err) != MDBX_SUCCESS)) + goto done; + const size_t datasize = node_ds(node); + const pgno_t npages = number_of_ovpages(ctx->mc_env, datasize); poke_pgno(node_data(node), ctx->mc_next_pgno); - MDBX_page *osrc; - rc = mdbx_page_get(mc, pgno, &osrc, mp->mp_txnid); + rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, + npages); if (unlikely(rc != MDBX_SUCCESS)) goto done; - - unsigned side = ctx->mc_head & 1; - if (ctx->mc_wlen[side] + ctx->mc_env->me_psize > - (size_t)MDBX_ENVCOPY_WRITEBUF) { - rc = compacting_toggle_write_buffers(ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - side = ctx->mc_head & 1; - } - copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]); - memcpy(copy, osrc, ctx->mc_env->me_psize); - copy->mp_pgno = ctx->mc_next_pgno; - ctx->mc_next_pgno += osrc->mp_pages; - ctx->mc_wlen[side] += ctx->mc_env->me_psize; - - if (osrc->mp_pages > 1) { - ctx->mc_olen[side] = pgno2bytes(ctx->mc_env, osrc->mp_pages - 1); - ctx->mc_over[side] = (uint8_t *)osrc + ctx->mc_env->me_psize; - rc = compacting_toggle_write_buffers(ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - side = ctx->mc_head & 1; - } } else if (node_flags(node) & F_SUBDATA) { if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { @@ -19572,54 +19604,47 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, } else { mc->mc_ki[mc->mc_top]++; if (mc->mc_ki[mc->mc_top] < n) { - again:; - const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (unlikely(node_flags(node))) { - mdbx_error("unexpected type 0x%x of 
node #%u on page #%" PRIaPGNO, - node_flags(node), mc->mc_ki[mc->mc_top], - mc->mc_pg[mc->mc_top]->mp_pgno); - rc = MDBX_CORRUPTED; - goto done; - } - rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - mc->mc_top++; - mc->mc_snum++; - mc->mc_ki[mc->mc_top] = 0; - if (IS_BRANCH(mp)) { + while (1) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + mc->mc_top++; + mc->mc_snum++; + mc->mc_ki[mc->mc_top] = 0; + if (!IS_BRANCH(mp)) { + mc->mc_pg[mc->mc_top] = mp; + break; + } /* Whenever we advance to a sibling branch page, * we must proceed all the way down to its first leaf. */ mdbx_page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); - goto again; - } else - mc->mc_pg[mc->mc_top] = mp; + } continue; } } - unsigned side = ctx->mc_head & 1; - if (ctx->mc_wlen[side] + ctx->mc_env->me_psize > - (size_t)MDBX_ENVCOPY_WRITEBUF) { - rc = compacting_toggle_write_buffers(ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - side = ctx->mc_head & 1; + const pgno_t pgno = ctx->mc_next_pgno; + if (likely(!IS_LEAF2(mp))) { + rc = compacting_put_page( + ctx, mp, PAGEHDRSZ + mp->mp_lower, + ctx->mc_env->me_psize - (PAGEHDRSZ + mp->mp_upper), 1); + } else { + rc = compacting_put_page( + ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->mp_leaf2_ksize, 0, 1); } - copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]); - mdbx_page_copy(copy, mp, ctx->mc_env->me_psize); - copy->mp_pgno = ctx->mc_next_pgno++; - ctx->mc_wlen[side] += ctx->mc_env->me_psize; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; if (mc->mc_top) { /* Update parent if there is one */ node_set_pgno( page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]), - copy->mp_pgno); + pgno); mdbx_cursor_pop(mc); } else { /* Otherwise we're done */ - *root = copy->mp_pgno; + *root = pgno; break; } } From 
fbe97a79a32d5d063c7671420e84d6865261084d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 6 Jul 2022 22:51:57 +0300 Subject: [PATCH 021/364] mdbx: more checking for a large/overflow nodes and pages. --- src/core.c | 201 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 126 insertions(+), 75 deletions(-) diff --git a/src/core.c b/src/core.c index 90bdc62e..361b52c2 100644 --- a/src/core.c +++ b/src/core.c @@ -536,7 +536,7 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, /* Calculate the size of a leaf node. * * The size depends on the environment's page size; if a data item - * is too large it will be put onto an overflow page and the node + * is too large it will be put onto an large/overflow page and the node * size will only include the key and not the data. Sizes are always * rounded up to an even number of bytes, to guarantee 2-byte alignment * of the MDBX_node headers. */ @@ -544,7 +544,7 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { size_t node_bytes = node_size(key, data); if (node_bytes > env->me_leaf_nodemax) { - /* put on overflow page */ + /* put on large/overflow page */ node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t); } @@ -554,7 +554,7 @@ leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { /* Calculate the size of a branch node. * * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto overflow + * we currently don't support spilling large keys onto large/overflow * pages, it's simply the size of the MDBX_node header plus the * size of the key. Sizes are always rounded up to an even number * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. 
@@ -569,7 +569,7 @@ branch_size(const MDBX_env *env, const MDBX_val *key) { * This is just the node header plus the key, there is no data. */ size_t node_bytes = node_size(key, nullptr); if (unlikely(node_bytes > env->me_leaf_nodemax)) { - /* put on overflow page */ + /* put on large/overflow page */ /* not implemented */ mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__, __LINE__); @@ -677,7 +677,7 @@ page_fill(const MDBX_env *env, const MDBX_page *mp) { return page_used(env, mp) * 100.0 / page_space(env); } -/* The number of overflow pages needed to store the given size. */ +/* The number of large/overflow pages needed to store the given size. */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t number_of_ovpages(const MDBX_env *env, size_t bytes) { return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; @@ -4240,7 +4240,7 @@ static const char *mdbx_leafnode_type(MDBX_node *n) { static const char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; return F_ISSET(node_flags(n), F_BIGDATA) - ? ": overflow page" + ? ": large page" : tp[F_ISSET(node_flags(n), F_DUPDATA)] [F_ISSET(node_flags(n), F_SUBDATA)]; } @@ -6726,7 +6726,7 @@ page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { txn->mt_next_pgno + (size_t)num) || gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid overflow the page list. + /* Stop reclaiming to avoid large/overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. * todo4recovery://erased_by_github/libmdbx/issues/123 */ @@ -7079,7 +7079,7 @@ __hot static struct page_result page_alloc(MDBX_cursor *mc) { return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); } -/* Copy the used portions of a non-overflow page. */ +/* Copy the used portions of a non-large/overflow page. 
*/ __hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, size_t psize) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); @@ -10368,7 +10368,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } } } else { - /* from begin to end with dst shrinking (a lot of new overflow pages) */ + /* from begin to end with shrinking (a lot of new large/overflow pages) */ for (l = s = d = 1; s <= src->length && d <= dst->length;) { if (unlikely(l >= d)) { /* squash to get a gap of free space for merge */ @@ -13469,7 +13469,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, mdbx_debug("entries: %" PRIu64, db->md_entries); mdbx_debug("branch pages: %" PRIaPGNO, db->md_branch_pages); mdbx_debug("leaf pages: %" PRIaPGNO, db->md_leaf_pages); - mdbx_debug("overflow pages: %" PRIaPGNO, db->md_overflow_pages); + mdbx_debug("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); mdbx_debug("root: %" PRIaPGNO, db->md_root); mdbx_debug("schema_altered: %" PRIaTXN, db->md_mod_txnid); } @@ -13955,14 +13955,29 @@ dirty: goto bailout; } - if (unlikely((ret.page->mp_upper < ret.page->mp_lower || - ((ret.page->mp_lower | ret.page->mp_upper) & 1) || - PAGEHDRSZ + ret.page->mp_upper > env->me_psize) && - !IS_OVERFLOW(ret.page))) { - ret.err = - bad_page(ret.page, "invalid page lower(%u)/upper(%u) with limit (%u)\n", - ret.page->mp_lower, ret.page->mp_upper, page_space(env)); - goto bailout; + if (!IS_OVERFLOW(ret.page)) { + if (unlikely(ret.page->mp_upper < ret.page->mp_lower || + ((ret.page->mp_lower | ret.page->mp_upper) & 1) || + PAGEHDRSZ + ret.page->mp_upper > env->me_psize)) { + ret.err = + bad_page(ret.page, "invalid page lower(%u)/upper(%u) with limit %u\n", + ret.page->mp_lower, ret.page->mp_upper, page_space(env)); + goto bailout; + } + } else { + const pgno_t npages = ret.page->mp_pages; + if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) { + ret.err = + bad_page(ret.page, "invalid n-pages (%u) for 
large-page\n", npages); + goto bailout; + } + if (unlikely(ret.page->mp_pgno + npages > mc->mc_txn->mt_next_pgno)) { + ret.err = bad_page( + ret.page, + "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + ret.page->mp_pgno + npages, mc->mc_txn->mt_next_pgno); + goto bailout; + } } #endif /* !MDBX_DISABLE_VALIDATION */ @@ -14257,40 +14272,40 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, return mdbx_page_search_root(mc, key, flags); } -/* Read overflow node data. */ +/* Read large/overflow node data. */ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, MDBX_val *data, const MDBX_page *mp) { mdbx_cassert(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); - struct page_result ret = + struct page_result lp = mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); - if (unlikely((ret.err != MDBX_SUCCESS))) { - mdbx_debug("read overflow page %" PRIaPGNO " failed", + if (unlikely((lp.err != MDBX_SUCCESS))) { + mdbx_debug("read large/overflow page %" PRIaPGNO " failed", node_largedata_pgno(node)); - return ret.err; + return lp.err; } - data->iov_base = page_data(ret.page); + data->iov_base = page_data(lp.page); if (!MDBX_DISABLE_VALIDATION && - unlikely(PAGETYPE_EXTRA(ret.page) != P_OVERFLOW)) - return bad_page(ret.page, "invalid page-type 0x%x for bigdata-node", - PAGETYPE_EXTRA(ret.page)); - if (!MDBX_DISABLE_VALIDATION && - unlikely(node_size_len(node_ks(node), data->iov_len) <= - mc->mc_txn->mt_env->me_leaf_nodemax)) - bad_page(mp, "too small data (%zu bytes) for bigdata-node", data->iov_len); - if (!MDBX_DISABLE_VALIDATION && - unlikely(ret.page->mp_pages != - number_of_ovpages(mc->mc_txn->mt_env, data->iov_len))) { - if (ret.page->mp_pages < - number_of_ovpages(mc->mc_txn->mt_env, data->iov_len)) - return bad_page(ret.page, - "too less n-pages %u for bigdata-node (%zu bytes)", - ret.page->mp_pages, data->iov_len); - else - bad_page(ret.page, "extra n-pages %u for 
bigdata-node (%zu bytes)", - ret.page->mp_pages, data->iov_len); + unlikely(PAGETYPE_EXTRA(lp.page) != P_OVERFLOW)) + return bad_page(lp.page, "invalid page-type 0x%x for bigdata-node", + PAGETYPE_EXTRA(lp.page)); + if (!MDBX_DISABLE_VALIDATION) { + const MDBX_env *env = mc->mc_txn->mt_env; + const size_t dsize = data->iov_len; + if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax)) + bad_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + const unsigned npages = number_of_ovpages(env, dsize); + if (unlikely(lp.page->mp_pages != npages)) { + if (lp.page->mp_pages < npages) + return bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); + else + bad_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); + } } return MDBX_SUCCESS; } @@ -16096,7 +16111,7 @@ new_sub:; nflags |= MDBX_SPLIT_REPLACE; rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc); + rc = insert_key ? mdbx_cursor_check(mc) : mdbx_cursor_check_updating(mc); } else { /* There is room already in this leaf page. */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { @@ -16338,7 +16353,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) return MDBX_INCOMPATIBLE; - /* add overflow pages to free list */ + /* add large/overflow pages to free list */ if (F_ISSET(node_flags(node), F_BIGDATA)) { MDBX_page *omp; if (unlikely((rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, @@ -16511,13 +16526,13 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, size_t node_bytes; if (unlikely(flags & F_BIGDATA)) { - /* Data already on overflow page. */ + /* Data already on large/overflow page. 
*/ STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); } else if (unlikely(node_size(key, data) > mc->mc_txn->mt_env->me_leaf_nodemax)) { - /* Put data on overflow page. */ + /* Put data on large/overflow page. */ if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) { mdbx_error("Unexpected target %s flags 0x%x for large data-item", "dupsort-db", mc->mc_db->md_flags); @@ -16533,7 +16548,7 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; largepage = npr.page; - mdbx_debug("allocated %u overflow page(s) %" PRIaPGNO "for %" PRIuPTR + mdbx_debug("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR " data bytes", largepage->mp_pages, largepage->mp_pgno, data->iov_len); flags |= F_BIGDATA; @@ -18124,30 +18139,43 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", mp->mp_flags & flags_mask, flags_expected); + mdbx_cassert(mc, (mc->mc_checking & CC_LEAF2) == 0 || + (mc->mc_flags & C_SUB) != 0); const uint8_t type = PAGETYPE_EXTRA(mp); switch (type) { default: return bad_page(mp, "invalid type (%u)\n", type); case P_OVERFLOW: - if (unlikely((mc->mc_flags & C_SUB) || (mc->mc_checking & CC_LEAF2))) - rc = - bad_page(mp, "unexpected overflow-page for dupsort db (flags 0x%x)\n", - mc->mc_db->md_flags); - if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2)) - rc = bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages); - if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno)) - rc = bad_page(mp, "overflow page beyond (%u) next-pgno\n", - mp->mp_pgno + mp->mp_pages); - return rc; - case P_LEAF: + if (unlikely(mc->mc_flags & C_SUB)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large", + "nested dupsort tree", mc->mc_db->md_flags); + const pgno_t npages = mp->mp_pages; + if (unlikely(npages < 1 || 
npages >= MAX_PAGENO / 2)) + rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(mp->mp_pgno + npages > mc->mc_txn->mt_next_pgno)) + rc = bad_page( + mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + mp->mp_pgno + npages, mc->mc_txn->mt_next_pgno); + return rc; //-------------------------- end of large/overflow page handling case P_LEAF | P_SUBP: + if (unlikely(mc->mc_db->md_depth != 1)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf-sub", "nested dupsort db", mc->mc_db->md_flags); + /* fall through */ + __fallthrough; + case P_LEAF: if (unlikely((mc->mc_checking & CC_LEAF2) != 0)) rc = bad_page( mp, "unexpected leaf-page for dupfixed subtree (db-lags 0x%x)\n", mc->mc_db->md_flags); break; - case P_LEAF | P_LEAF2: case P_LEAF | P_LEAF2 | P_SUBP: + if (unlikely(mc->mc_db->md_depth != 1)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf2-sub", "nested dupsort db", mc->mc_db->md_flags); + /* fall through */ + __fallthrough; + case P_LEAF | P_LEAF2: if (unlikely((mc->mc_checking & CC_LEAF2) == 0)) rc = bad_page( mp, @@ -18158,12 +18186,24 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, break; } + if (unlikely(mp->mp_upper < mp->mp_lower || + ((mp->mp_lower | mp->mp_upper) & 1) || + PAGEHDRSZ + mp->mp_upper > env->me_psize)) + rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %u\n", + mp->mp_lower, mp->mp_upper, page_space(env)); + char *const end_of_page = (char *)mp + env->me_psize; const unsigned nkeys = page_numkeys(mp); - if ((mc->mc_checking & CC_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp)) { - if (unlikely(nkeys < 2 && IS_BRANCH(mp))) - rc = bad_page(mp, "branch-page nkeys (%u) < 2\n", nkeys); - } + if (unlikely(nkeys <= IS_BRANCH(mp)) && + (!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && + ((mc->mc_checking & CC_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp))) + rc = bad_page(mp, "%s-page nkeys (%u) < %u\n", + 
IS_BRANCH(mp) ? "branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); + if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + + nkeys * sizeof(MDBX_node) + nkeys - 1 > + env->me_psize)) + rc = bad_page(mp, "invalid page upper (%u) for nkeys %u with limit %u\n", + mp->mp_upper, nkeys, page_space(env)); const size_t ksize_max = keysize_max(env->me_psize, 0); const size_t leaf2_ksize = mp->mp_leaf2_ksize; @@ -18279,29 +18319,35 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, mp, "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + if (unlikely(node_size_len(node_ks(node), dsize) <= + mc->mc_txn->mt_env->me_leaf_nodemax)) + bad_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + if ((mc->mc_checking & CC_RETIRING) == 0) { /* Disable full checking to avoid infinite recursion * with a corrupted DB */ -#if !MDBX_DISABLE_VALIDATION const uint8_t save_checking_level = mc->mc_checking; mc->mc_checking &= ~CC_PAGECHECK; -#endif /* MDBX_DISABLE_VALIDATION */ const struct page_result lp = mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); -#if !MDBX_DISABLE_VALIDATION mc->mc_checking = save_checking_level; -#endif /* MDBX_DISABLE_VALIDATION */ if (unlikely(lp.err != MDBX_SUCCESS)) return lp.err; if (unlikely(!IS_OVERFLOW(lp.page))) { - rc = bad_page(mp, "big-node refs to non-overflow page (%u)\n", + rc = bad_page(mp, "big-node refs to non-large page (%u)\n", lp.page->mp_pgno); continue; } - if (unlikely(number_of_ovpages(env, dsize) > lp.page->mp_pages)) - rc = - bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n", - dsize, lp.page->mp_pages); + const unsigned npages = number_of_ovpages(env, dsize); + if (unlikely(lp.page->mp_pages != npages)) { + if (lp.page->mp_pages < npages) + rc = bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); + else + bad_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", + 
lp.page->mp_pages, dsize); + } } continue; } @@ -18345,8 +18391,6 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } else { const MDBX_page *const sp = (MDBX_page *)data; - const char *const end_of_subpage = data + dsize; - const int nsubkeys = page_numkeys(sp); switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) { case P_LEAF | P_SUBP: case P_LEAF | P_LEAF2 | P_SUBP: @@ -18357,6 +18401,13 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } + const char *const end_of_subpage = data + dsize; + const int nsubkeys = page_numkeys(sp); + if (unlikely(nsubkeys == 0) && !(mc->mc_checking & CC_UPDATING) && + mc->mc_db->md_entries) + rc = bad_page(mp, "no keys on a %s-page\n", + IS_LEAF2(sp) ? "leaf2-sub" : "leaf-sub"); + MDBX_val sub_here, sub_prev = {0, 0}; for (int j = 0; j < nsubkeys; j++) { if (IS_LEAF2(sp)) { @@ -20974,7 +21025,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. * This also avoids any P_LEAF2 pages, which have no nodes. - * Also if the DB doesn't have sub-DBs and has no overflow + * Also if the DB doesn't have sub-DBs and has no large/overflow * pages, omit scanning leaves. */ if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) mdbx_cursor_pop(mc); From a812198c4943d5428401082bf079a8b5f115637c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 7 Jul 2022 02:25:35 +0300 Subject: [PATCH 022/364] mdbx: rework/clone `page_get()` to three for the cases: any, large, branch-leaf. 
--- src/core.c | 388 +++++++++++++++++++++++++----------------------- src/internals.h | 3 +- 2 files changed, 203 insertions(+), 188 deletions(-) diff --git a/src/core.c b/src/core.c index 361b52c2..25ceb060 100644 --- a/src/core.c +++ b/src/core.c @@ -3816,17 +3816,16 @@ static __must_check_result __inline int mdbx_page_retire(MDBX_cursor *mc, static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, unsigned npages); -struct page_result { +typedef struct page_result { MDBX_page *page; int err; -}; +} pgr_t; static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, const txnid_t laggard); -static struct page_result page_new(MDBX_cursor *mc, const unsigned flags); -static struct page_result page_new_large(MDBX_cursor *mc, - const unsigned npages); +static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); +static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages); static int mdbx_page_touch(MDBX_cursor *mc); static int mdbx_cursor_touch(MDBX_cursor *mc); static int mdbx_touch_dbi(MDBX_cursor *mc); @@ -3853,12 +3852,32 @@ enum { #define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode); -__hot static struct page_result __must_check_result -mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front); -static __always_inline int __must_check_result mdbx_page_get( - MDBX_cursor *mc, const pgno_t pgno, MDBX_page **mp, const txnid_t front) { +static __always_inline pgr_t page_get_inline(const uint16_t ILL, + MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front); - struct page_result ret = mdbx_page_get_ex(mc, pgno, front); +static pgr_t page_get_any(MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS, mc, pgno, front); +} + +__hot static pgr_t page_get_three(MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, 
front); +} + +static pgr_t page_get_large(MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno, + front); +} + +static __always_inline int __must_check_result page_get(MDBX_cursor *mc, + const pgno_t pgno, + MDBX_page **mp, + const txnid_t front) { + pgr_t ret = page_get_three(mc, pgno, front); *mp = ret.page; return ret.err; } @@ -4822,12 +4841,13 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, bool is_frozen = false, is_spilled = false, is_shadowed = false; if (unlikely(!mp)) { if (mdbx_assert_enabled() && pagetype) { - MDBX_page *check; - rc = mdbx_page_get(mc, pgno, &check, txn->mt_front); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_tassert(txn, (PAGETYPE(check) & ~P_LEAF2) == (pagetype & ~P_FROZEN)); - mdbx_tassert(txn, !(pagetype & P_FROZEN) || IS_FROZEN(txn, check)); + pgr_t check; + check = page_get_any(mc, pgno, txn->mt_front); + if (unlikely(check.err != MDBX_SUCCESS)) + return check.err; + mdbx_tassert(txn, + (PAGETYPE(check.page) & ~P_LEAF2) == (pagetype & ~P_FROZEN)); + mdbx_tassert(txn, !(pagetype & P_FROZEN) || IS_FROZEN(txn, check.page)); } if (pagetype & P_FROZEN) { is_frozen = true; @@ -4863,9 +4883,10 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, goto status_done; } - rc = mdbx_page_get(mc, pgno, &mp, txn->mt_front); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + pgr_t pg = page_get_any(mc, pgno, txn->mt_front); + if (unlikely(pg.err != MDBX_SUCCESS)) + return pg.err; + mp = pg.page; mdbx_tassert(txn, !pagetype || PAGETYPE(mp) == pagetype); pagetype = PAGETYPE(mp); } @@ -6552,9 +6573,9 @@ __hot static pgno_t *scan4range(const MDBX_PNL pnl, const unsigned len, #define MDBX_ALLOC_NOLOG 32 #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -__cold static struct page_result -page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { - struct page_result ret; +__cold static pgr_t 
page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, + int flags) { + pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; mdbx_assert(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); @@ -7009,7 +7030,7 @@ done: return ret; } -__hot static struct page_result page_alloc(MDBX_cursor *mc) { +__hot static pgr_t page_alloc(MDBX_cursor *mc) { MDBX_txn *const txn = mc->mc_txn; /* If there are any loose pages, just use them */ @@ -7032,7 +7053,7 @@ __hot static struct page_result page_alloc(MDBX_cursor *mc) { VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env)); MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env)); page->mp_txnid = txn->mt_front; - struct page_result ret = {page, MDBX_SUCCESS}; + pgr_t ret = {page, MDBX_SUCCESS}; return ret; } @@ -7050,7 +7071,7 @@ __hot static struct page_result page_alloc(MDBX_cursor *mc) { #endif MDBX_env *const env = txn->mt_env; - struct page_result ret; + pgr_t ret; if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); @@ -7105,13 +7126,13 @@ __hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, * * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. 
*/ -static struct page_result __must_check_result -mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { +static pgr_t __must_check_result mdbx_page_unspill(MDBX_txn *const txn, + const MDBX_page *const mp) { mdbx_verbose("unspill page %" PRIaPGNO, mp->mp_pgno); mdbx_tassert(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); mdbx_tassert(txn, IS_SPILLED(txn, mp)); const MDBX_txn *scan = txn; - struct page_result ret; + pgr_t ret; do { mdbx_tassert(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); const unsigned si = mdbx_search_spilled(scan, mp->mp_pgno); @@ -7188,7 +7209,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { rc = mdbx_pnl_need(&txn->tw.retired_pages, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - const struct page_result par = page_alloc(mc); + const pgr_t par = page_alloc(mc); rc = par.err; np = par.page; if (unlikely(rc != MDBX_SUCCESS)) @@ -7215,7 +7236,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { np->mp_pgno = pgno; np->mp_txnid = txn->mt_front; } else if (IS_SPILLED(txn, mp)) { - struct page_result pur = mdbx_page_unspill(txn, mp); + pgr_t pur = mdbx_page_unspill(txn, mp); np = pur.page; rc = pur.err; if (likely(rc == MDBX_SUCCESS)) { @@ -13883,110 +13904,129 @@ static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { return MDBX_SUCCESS; } -__hot static struct page_result mdbx_page_get_ex(MDBX_cursor *const mc, - const pgno_t pgno, - const txnid_t front) { - struct page_result ret; +__hot static __noinline MDBX_page *page_lookup_spilled(MDBX_txn *const txn, + const pgno_t pgno) { + const MDBX_txn *spiller = txn; + do { + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). 
*/ + if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && + mdbx_search_spilled(spiller, pgno)) + break; + + const unsigned i = mdbx_dpl_search(spiller, pgno); + mdbx_tassert(txn, (int)i > 0); + if (spiller->tw.dirtylist->items[i].pgno == pgno) { + spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; + return spiller->tw.dirtylist->items[i].ptr; + } + + spiller = spiller->mt_parent; + } while (spiller); + + return pgno2page(txn->mt_env, pgno); +} + +__hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, + MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front) { MDBX_txn *const txn = mc->mc_txn; mdbx_tassert(txn, front <= txn->mt_front); + + pgr_t r; if (unlikely(pgno >= txn->mt_next_pgno)) { mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); notfound: - ret.page = nullptr; - ret.err = MDBX_PAGE_NOTFOUND; + r.page = nullptr; + r.err = MDBX_PAGE_NOTFOUND; bailout: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return ret; + return r; } + r.page = pgno2page(txn->mt_env, pgno); + if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) + r.page = page_lookup_spilled(txn, pgno); + MDBX_env *const env = txn->mt_env; mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) { - const MDBX_txn *spiller = txn; - do { - /* Spilled pages were dirtied in this txn and flushed - * because the dirty list got full. Bring this page - * back in from the map (but don't unspill it here, - * leave that unless page_touch happens again). 
*/ - if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && - mdbx_search_spilled(spiller, pgno)) { - goto spilled; - } - const unsigned i = mdbx_dpl_search(spiller, pgno); - assert((int)i > 0); - if (spiller->tw.dirtylist->items[i].pgno == pgno) { - ret.page = spiller->tw.dirtylist->items[i].ptr; - spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; - goto dirty; - } - - spiller = spiller->mt_parent; - } while (spiller != NULL); - } - -spilled: - ret.page = pgno2page(env, pgno); - -dirty: - if (unlikely(ret.page->mp_pgno != pgno)) { - bad_page(ret.page, + if (unlikely(r.page->mp_pgno != pgno)) { + bad_page(r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", - ret.page->mp_pgno, pgno); + r.page->mp_pgno, pgno); goto notfound; } #if !MDBX_DISABLE_VALIDATION - if (unlikely(ret.page->mp_flags & P_ILL_BITS)) { - ret.err = - bad_page(ret.page, "invalid page's flags (%u)\n", ret.page->mp_flags); + if (unlikely(r.page->mp_flags & ILL)) { + if (ILL == P_ILL_BITS || (r.page->mp_flags & P_ILL_BITS)) + r.err = bad_page(r.page, "invalid page's flags (%u)\n", r.page->mp_flags); + else if (ILL & P_OVERFLOW) { + assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0); + assert(r.page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + r.err = bad_page(r.page, "unexpected %s instead of %s (%u)\n", + "large/overlow", "branch/leaf/leaf2", r.page->mp_flags); + } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) { + assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2)); + assert(r.page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + r.err = bad_page(r.page, "unexpected %s instead of %s (%u)\n", + "branch/leaf/leaf2", "large/overlow", r.page->mp_flags); + } else { + assert(false); + } goto bailout; } - if (unlikely(ret.page->mp_txnid > front) && - unlikely(ret.page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) { - ret.err = bad_page( - ret.page, + if (unlikely(r.page->mp_txnid > front) && + unlikely(r.page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) { + 
r.err = bad_page( + r.page, "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", - ret.page->mp_txnid, + r.page->mp_txnid, (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn" : "parent-page", front); goto bailout; } - if (!IS_OVERFLOW(ret.page)) { - if (unlikely(ret.page->mp_upper < ret.page->mp_lower || - ((ret.page->mp_lower | ret.page->mp_upper) & 1) || - PAGEHDRSZ + ret.page->mp_upper > env->me_psize)) { - ret.err = - bad_page(ret.page, "invalid page lower(%u)/upper(%u) with limit %u\n", - ret.page->mp_lower, ret.page->mp_upper, page_space(env)); + if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(r.page)) && + (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { + if (unlikely(r.page->mp_upper < r.page->mp_lower || + ((r.page->mp_lower | r.page->mp_upper) & 1) || + PAGEHDRSZ + r.page->mp_upper > env->me_psize)) { + r.err = + bad_page(r.page, "invalid page' lower(%u)/upper(%u) with limit %u\n", + r.page->mp_lower, r.page->mp_upper, page_space(env)); + goto bailout; + } + } else if ((ILL & P_OVERFLOW) == 0) { + const pgno_t npages = r.page->mp_pages; + if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) { + r.err = bad_page(r.page, "invalid n-pages (%u) for large-page\n", npages); + goto bailout; + } + if (unlikely(r.page->mp_pgno + npages > txn->mt_next_pgno)) { + r.err = bad_page( + r.page, + "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + r.page->mp_pgno + npages, txn->mt_next_pgno); goto bailout; } } else { - const pgno_t npages = ret.page->mp_pages; - if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) { - ret.err = - bad_page(ret.page, "invalid n-pages (%u) for large-page\n", npages); - goto bailout; - } - if (unlikely(ret.page->mp_pgno + npages > mc->mc_txn->mt_next_pgno)) { - ret.err = bad_page( - ret.page, - "end of large-page beyond (%u) allocated space (%u next-pgno)\n", - ret.page->mp_pgno + npages, mc->mc_txn->mt_next_pgno); - goto bailout; - } + assert(false); } #endif /* !MDBX_DISABLE_VALIDATION */ if 
(unlikely(mc->mc_checking & CC_PAGECHECK) && - unlikely(MDBX_SUCCESS != (ret.err = mdbx_page_check(mc, ret.page)))) + unlikely(MDBX_SUCCESS != (r.err = mdbx_page_check(mc, r.page)))) goto bailout; - ret.err = MDBX_SUCCESS; - return ret; + r.err = MDBX_SUCCESS; + return r; } /* Finish mdbx_page_search() / mdbx_page_search_lowest(). @@ -14034,7 +14074,7 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, mdbx_cassert(mc, i >= 0 && i < (int)page_numkeys(mp)); node = page_node(mp, i); - rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14183,7 +14223,7 @@ __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { mdbx_cassert(mc, IS_BRANCH(mp)); MDBX_node *node = page_node(mp, 0); - int rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + int rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14249,7 +14289,7 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, } while (unlikely((scan = scan->mt_parent) != nullptr)); } - if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0)) + if (unlikely((rc = page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0)) return rc; } @@ -14278,19 +14318,15 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, mdbx_cassert(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); - struct page_result lp = - mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); if (unlikely((lp.err != MDBX_SUCCESS))) { mdbx_debug("read large/overflow page %" PRIaPGNO " failed", node_largedata_pgno(node)); return lp.err; } + mdbx_cassert(mc, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); data->iov_base = page_data(lp.page); - if (!MDBX_DISABLE_VALIDATION && - unlikely(PAGETYPE_EXTRA(lp.page) != 
P_OVERFLOW)) - return bad_page(lp.page, "invalid page-type 0x%x for bigdata-node", - PAGETYPE_EXTRA(lp.page)); if (!MDBX_DISABLE_VALIDATION) { const MDBX_env *env = mc->mc_txn->mt_env; const size_t dsize = data->iov_len; @@ -14456,7 +14492,7 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -15728,7 +15764,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(err != MDBX_SUCCESS)) return err; } - struct page_result npr = page_new(mc, P_LEAF); + pgr_t npr = page_new(mc, P_LEAF); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; npr.err = mdbx_cursor_push(mc, npr.page); @@ -15831,29 +15867,27 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, : 0; const pgno_t pgno = node_largedata_pgno(node); - struct page_result pgr = - mdbx_page_get_ex(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); - if (unlikely(pgr.err != MDBX_SUCCESS)) - return pgr.err; - if (unlikely(!IS_OVERFLOW(pgr.page))) - return MDBX_CORRUPTED; + pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + mdbx_cassert(mc, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); /* Is the ov page from this txn (or a parent) and big enough? */ - int ovpages = pgr.page->mp_pages; - if (!IS_FROZEN(mc->mc_txn, pgr.page) && + int ovpages = lp.page->mp_pages; + if (!IS_FROZEN(mc->mc_txn, lp.page) && (unlikely(mc->mc_flags & C_GCFREEZE) ? (ovpages >= dpages) : (ovpages == /* LY: add configurable threshold to keep reserve space */ dpages))) { /* yes, overwrite it. 
*/ - if (!IS_MODIFIABLE(mc->mc_txn, pgr.page)) { - if (IS_SPILLED(mc->mc_txn, pgr.page)) { - pgr = /* TODO: avoid search and get txn & spill-index from + if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) { + if (IS_SPILLED(mc->mc_txn, lp.page)) { + lp = /* TODO: avoid search and get txn & spill-index from page_result */ - mdbx_page_unspill(mc->mc_txn, pgr.page); - if (unlikely(pgr.err)) - return pgr.err; + mdbx_page_unspill(mc->mc_txn, lp.page); + if (unlikely(lp.err)) + return lp.err; } else { if (unlikely(!mc->mc_txn->mt_parent)) { mdbx_error( @@ -15861,7 +15895,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," " without parent transaction, current txn %" PRIaTXN " front %" PRIaTXN, - "overflow/large", pgno, pgr.page->mp_txnid, + "overflow/large", pgno, lp.page->mp_txnid, mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); return MDBX_PROBLEM; } @@ -15871,8 +15905,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(!np)) return MDBX_ENOMEM; - memcpy(np, pgr.page, PAGEHDRSZ); /* Copy header of page */ - err = mdbx_page_dirty(mc->mc_txn, pgr.page = np, ovpages); + memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */ + err = mdbx_page_dirty(mc->mc_txn, lp.page = np, ovpages); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -15884,9 +15918,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } node_set_ds(node, data->iov_len); if (F_ISSET(flags, MDBX_RESERVE)) - data->iov_base = page_data(pgr.page); + data->iov_base = page_data(lp.page); else - memcpy(page_data(pgr.page), data->iov_base, data->iov_len); + memcpy(page_data(lp.page), data->iov_base, data->iov_len); if (mdbx_audit_enabled()) { err = mdbx_cursor_check(mc); @@ -15896,7 +15930,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } - if ((err = mdbx_page_retire(mc, pgr.page)) != MDBX_SUCCESS) + if ((err = mdbx_page_retire(mc, 
lp.page)) != MDBX_SUCCESS) return err; } else { olddata.iov_len = node_ds(node); @@ -16018,7 +16052,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, nested_dupdb.md_entries = page_numkeys(fp); xdata.iov_len = sizeof(nested_dupdb); xdata.iov_base = &nested_dupdb; - const struct page_result par = page_alloc(mc); + const pgr_t par = page_alloc(mc); mp = par.page; if (unlikely(par.err != MDBX_SUCCESS)) return par.err; @@ -16355,10 +16389,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { /* add large/overflow pages to free list */ if (F_ISSET(node_flags(node), F_BIGDATA)) { - MDBX_page *omp; - if (unlikely((rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, - mp->mp_txnid)) || - (rc = mdbx_page_retire(mc, omp)))) + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((rc = lp.err) || (rc = mdbx_page_retire(mc, lp.page)))) goto fail; } @@ -16372,9 +16404,9 @@ fail: /* Allocate and initialize new pages for a database. * Set MDBX_TXN_ERROR on failure. */ -static struct page_result page_new(MDBX_cursor *mc, const unsigned flags) { +static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { mdbx_cassert(mc, (flags & P_OVERFLOW) == 0); - struct page_result ret = page_alloc(mc); + pgr_t ret = page_alloc(mc); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; @@ -16403,11 +16435,10 @@ static struct page_result page_new(MDBX_cursor *mc, const unsigned flags) { return ret; } -static struct page_result page_new_large(MDBX_cursor *mc, - const unsigned npages) { - struct page_result ret = - likely(npages == 1) ? page_alloc(mc) - : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); +static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { + pgr_t ret = likely(npages == 1) + ? 
page_alloc(mc) + : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; @@ -16544,7 +16575,7 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, return MDBX_PROBLEM; } const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); - const struct page_result npr = page_new_large(mc, ovpages); + const pgr_t npr = page_new_large(mc, ovpages); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; largepage = npr.page; @@ -17946,7 +17977,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { } else if (IS_BRANCH(mp) && nkeys == 1) { mdbx_debug("%s", "collapsing root page!"); mc->mc_db->md_root = node_pgno(page_node(mp, 0)); - rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); + rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_db->md_depth--; @@ -18005,7 +18036,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { MDBX_page *left = nullptr, *right = nullptr; if (mn.mc_ki[pre_top] > 0) { - rc = mdbx_page_get( + rc = page_get( &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), &left, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) @@ -18013,7 +18044,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top])); } if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { - rc = mdbx_page_get( + rc = page_get( &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), &right, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) @@ -18324,20 +18355,11 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, bad_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); if ((mc->mc_checking & CC_RETIRING) == 0) { - /* Disable full checking to avoid infinite recursion - * with a corrupted DB */ - const uint8_t save_checking_level = mc->mc_checking; - mc->mc_checking &= ~CC_PAGECHECK; - 
const struct page_result lp = - mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); - mc->mc_checking = save_checking_level; + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); if (unlikely(lp.err != MDBX_SUCCESS)) return lp.err; - if (unlikely(!IS_OVERFLOW(lp.page))) { - rc = bad_page(mp, "big-node refs to non-large page (%u)\n", - lp.page->mp_pgno); - continue; - } + mdbx_cassert(mc, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); const unsigned npages = number_of_ovpages(env, dsize); if (unlikely(lp.page->mp_pages != npages)) { if (lp.page->mp_pages < npages) @@ -18546,7 +18568,7 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc) { return MDBX_CURSOR_FULL; pgno_t pgno = node_pgno(node); MDBX_page *np; - err = mdbx_page_get(mc, pgno, &np, mp->mp_txnid); + err = page_get(mc, pgno, &np, mp->mp_txnid); mdbx_cassert(mc, err == MDBX_SUCCESS); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -18787,7 +18809,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mdbx_cassert(mc, nkeys + 1 >= minkeys * 2); /* Create a new sibling page. 
*/ - struct page_result npr = page_new(mc, mp->mp_flags); + pgr_t npr = page_new(mc, mp->mp_flags); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; MDBX_page *const sister = npr.page; @@ -19565,7 +19587,7 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, MDBX_cursor *mc, pgno_t *root, txnid_t parent_txnid) { mc->mc_snum = 1; - int rc = mdbx_page_get(mc, *root, &mc->mc_pg[0], parent_txnid); + int rc = page_get(mc, *root, &mc->mc_pg[0], parent_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19605,8 +19627,8 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, node = page_node(mp, i); } - const struct page_result lp = - mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid); + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); if (unlikely((rc = lp.err) != MDBX_SUCCESS)) goto done; const size_t datasize = node_ds(node); @@ -19657,7 +19679,7 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, if (mc->mc_ki[mc->mc_top] < n) { while (1) { const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) goto done; mc->mc_top++; @@ -21626,23 +21648,21 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; - int err = mdbx_page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); + int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); MDBX_page_type_t type = walk_page_type(mp); - const int nentries = (mp && !IS_OVERFLOW(mp)) ? page_numkeys(mp) : 1; - unsigned npages = (mp && IS_OVERFLOW(mp)) ? mp->mp_pages : 1; + const unsigned nentries = mp ? page_numkeys(mp) : 0; + unsigned npages = 1; size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); - size_t header_size = (mp && !IS_LEAF2(mp) && !IS_OVERFLOW(mp)) - ? 
PAGEHDRSZ + mp->mp_lower - : PAGEHDRSZ; + size_t header_size = + (mp && !IS_LEAF2(mp)) ? PAGEHDRSZ + mp->mp_lower : PAGEHDRSZ; size_t payload_size = 0; size_t unused_size = - (mp && !IS_OVERFLOW(mp) ? page_room(mp) : pagesize - header_size) - - payload_size; + (mp ? page_room(mp) : pagesize - header_size) - payload_size; size_t align_bytes = 0; - for (int i = 0; err == MDBX_SUCCESS && i < nentries; - align_bytes += ((payload_size + align_bytes) & 1), i++) { + for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; + align_bytes += ((payload_size + align_bytes) & 1), ++i) { if (type == MDBX_page_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ payload_size += mp->mp_leaf2_ksize; @@ -21670,18 +21690,12 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, const size_t over_header = PAGEHDRSZ; npages = 1; - MDBX_page *op; assert(err == MDBX_SUCCESS); - err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op, mp->mp_txnid); + pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid); + err = lp.err; if (err == MDBX_SUCCESS) { - /* LY: Don't use mask here, e.g bitwise - * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. */ - if (unlikely(P_OVERFLOW != op->mp_flags)) { - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - } else - npages = op->mp_pages; + mdbx_cassert(ctx->mw_cursor, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); + npages = lp.page->mp_pages; } pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); @@ -21718,7 +21732,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } MDBX_page *sp = node_data(node); - const int nsubkeys = page_numkeys(sp); + const unsigned nsubkeys = page_numkeys(sp); size_t subheader_size = IS_LEAF2(sp) ? 
PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; size_t subunused_size = page_room(sp); @@ -21739,8 +21753,8 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, err = MDBX_CORRUPTED; } - for (int j = 0; err == MDBX_SUCCESS && j < nsubkeys; - subalign_bytes += ((subpayload_size + subalign_bytes) & 1), j++) { + for (unsigned j = 0; err == MDBX_SUCCESS && j < nsubkeys; + subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) { if (subtype == MDBX_subpage_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ @@ -21780,7 +21794,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; - for (int i = 0; err == MDBX_SUCCESS && i < nentries; i++) { + for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { if (type == MDBX_page_dupfixed_leaf) continue; diff --git a/src/internals.h b/src/internals.h index 325a0524..185a5fac 100644 --- a/src/internals.h +++ b/src/internals.h @@ -528,7 +528,8 @@ typedef struct MDBX_page { #define P_SPILLED 0x2000 /* spilled in parent txn */ #define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ #define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union { uint32_t mp_pages; /* number of overflow pages */ From bc744a843abfb6ff0122610f0101480a5d6dc3de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 7 Jul 2022 15:48:24 +0300 Subject: [PATCH 023/364] mdbx: refine/speedup `PAGETYPE()`. 
--- src/core.c | 129 ++++++++++++++++++++++++++---------------------- src/internals.h | 35 +++++++------ 2 files changed, 90 insertions(+), 74 deletions(-) diff --git a/src/core.c b/src/core.c index 25ceb060..744a7328 100644 --- a/src/core.c +++ b/src/core.c @@ -696,6 +696,8 @@ __cold static const char *pagetype_caption(const uint8_t type, return "dupfixed-leaf"; case P_LEAF | P_LEAF2 | P_SUBP: return "dupfixed-subleaf"; + case P_LEAF | P_LEAF2 | P_SUBP | P_LEGACY_DIRTY: + return "dupfixed-subleaf.legacy-dirty"; case P_OVERFLOW: return "large"; default: @@ -713,7 +715,7 @@ __cold static int MDBX_PRINTF_ARGS(2, 3) prev = mp; mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0, "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", - pagetype_caption(PAGETYPE_EXTRA(mp), buf4unknown), + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, mp->mp_txnid); } @@ -728,7 +730,7 @@ __cold static int MDBX_PRINTF_ARGS(2, 3) /* Address of node i in page p */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node * page_node(const MDBX_page *mp, unsigned i) { - assert(PAGETYPE(mp) == P_LEAF || PAGETYPE(mp) == P_BRANCH); + assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH); assert(page_numkeys(mp) > (unsigned)(i)); assert(mp->mp_ptrs[i] % 2 == 0); return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); @@ -739,7 +741,7 @@ page_node(const MDBX_page *mp, unsigned i) { * There are no node headers, keys are stored contiguously. 
*/ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { - assert(PAGETYPE(mp) == (P_LEAF | P_LEAF2)); + assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); assert(mp->mp_leaf2_ksize == keysize); (void)keysize; return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); @@ -4273,7 +4275,7 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { MDBX_val key; DKBUF; - switch (PAGETYPE_EXTRA(mp)) { + switch (PAGETYPE_WHOLE(mp)) { case P_BRANCH: type = "Branch page"; break; @@ -4821,10 +4823,10 @@ static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, * to this txn's free list. */ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, MDBX_page *mp /* maybe null */, - int pagetype /* maybe unknown/zero */) { + unsigned pageflags /* maybe unknown/zero */) { int rc; MDBX_txn *const txn = mc->mc_txn; - mdbx_tassert(txn, !mp || (mp->mp_pgno == pgno && PAGETYPE(mp) == pagetype)); + mdbx_tassert(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags)); /* During deleting entire subtrees, it is reasonable and possible to avoid * reading leaf pages, i.e. 
significantly reduce hard page-faults & IOPs: @@ -4840,16 +4842,16 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, unsigned di = 0, si = 0, npages = 1; bool is_frozen = false, is_spilled = false, is_shadowed = false; if (unlikely(!mp)) { - if (mdbx_assert_enabled() && pagetype) { + if (mdbx_assert_enabled() && pageflags) { pgr_t check; check = page_get_any(mc, pgno, txn->mt_front); if (unlikely(check.err != MDBX_SUCCESS)) return check.err; - mdbx_tassert(txn, - (PAGETYPE(check.page) & ~P_LEAF2) == (pagetype & ~P_FROZEN)); - mdbx_tassert(txn, !(pagetype & P_FROZEN) || IS_FROZEN(txn, check.page)); + mdbx_tassert(txn, (check.page->mp_flags & ~P_LEAF2) == + (pageflags & ~P_FROZEN)); + mdbx_tassert(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } - if (pagetype & P_FROZEN) { + if (pageflags & P_FROZEN) { is_frozen = true; if (mdbx_assert_enabled()) { for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { @@ -4858,7 +4860,7 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, } } goto status_done; - } else if (pagetype && txn->tw.dirtylist) { + } else if (pageflags && txn->tw.dirtylist) { if ((di = mdbx_dpl_exist(txn, pgno)) != 0) { mp = txn->tw.dirtylist->items[di].ptr; mdbx_tassert(txn, IS_MODIFIABLE(txn, mp)); @@ -4887,8 +4889,8 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, if (unlikely(pg.err != MDBX_SUCCESS)) return pg.err; mp = pg.page; - mdbx_tassert(txn, !pagetype || PAGETYPE(mp) == pagetype); - pagetype = PAGETYPE(mp); + mdbx_tassert(txn, !pageflags || mp->mp_flags == pageflags); + pageflags = mp->mp_flags; } is_frozen = IS_FROZEN(txn, mp); @@ -4915,9 +4917,9 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, } status_done: - if (likely((pagetype & P_OVERFLOW) == 0)) { + if (likely((pageflags & P_OVERFLOW) == 0)) { STATIC_ASSERT(P_BRANCH == 1); - const bool is_branch = pagetype & P_BRANCH; + const bool is_branch = pageflags & P_BRANCH; if (unlikely(mc->mc_flags & 
C_SUB)) { MDBX_db *outer = mdbx_outer_db(mc); mdbx_cassert(mc, !is_branch || outer->md_branch_pages > 0); @@ -4927,8 +4929,8 @@ status_done: } mdbx_cassert(mc, !is_branch || mc->mc_db->md_branch_pages > 0); mc->mc_db->md_branch_pages -= is_branch; - mdbx_cassert(mc, (pagetype & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); - mc->mc_db->md_leaf_pages -= (pagetype & P_LEAF) != 0; + mdbx_cassert(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); + mc->mc_db->md_leaf_pages -= (pageflags & P_LEAF) != 0; } else { npages = mp->mp_pages; mdbx_cassert(mc, mc->mc_db->md_overflow_pages >= npages); @@ -5113,7 +5115,7 @@ status_done: } static __inline int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { - return mdbx_page_retire_ex(mc, mp->mp_pgno, mp, PAGETYPE(mp)); + return mdbx_page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); } struct mdbx_iov_ctx { @@ -14325,7 +14327,7 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, return lp.err; } - mdbx_cassert(mc, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); + mdbx_cassert(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); data->iov_base = page_data(lp.page); if (!MDBX_DISABLE_VALIDATION) { const MDBX_env *env = mc->mc_txn->mt_env; @@ -15870,7 +15872,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(lp.err != MDBX_SUCCESS)) return lp.err; - mdbx_cassert(mc, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); + mdbx_cassert(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); /* Is the ov page from this txn (or a parent) and big enough? 
*/ int ovpages = lp.page->mp_pages; @@ -16112,7 +16114,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { mdbx_cassert(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); - mdbx_cassert(mc, PAGETYPE(mc->mc_pg[mc->mc_top]) == P_LEAF); + mdbx_cassert(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); mdbx_cassert(mc, node_ds(node) == 0); mdbx_cassert(mc, node_flags(node) == 0); mdbx_cassert(mc, key->iov_len < UINT16_MAX); @@ -16469,7 +16471,7 @@ static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, key ? key->iov_len : 0, DKEY_DEBUG(key)); mdbx_cassert(mc, key); - mdbx_cassert(mc, PAGETYPE(mp) == (P_LEAF | P_LEAF2)); + mdbx_cassert(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); const unsigned ksize = mc->mc_db->md_xsize; mdbx_cassert(mc, ksize == key->iov_len); const unsigned nkeys = page_numkeys(mp); @@ -16506,7 +16508,7 @@ static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, key ? key->iov_len : 0, DKEY_DEBUG(key)); - mdbx_cassert(mc, PAGETYPE(mp) == P_BRANCH); + mdbx_cassert(mc, PAGETYPE_WHOLE(mp) == P_BRANCH); STATIC_ASSERT(NODESIZE % 2 == 0); /* Move higher pointers up one slot. */ @@ -16551,7 +16553,7 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0, key ? 
key->iov_len : 0, DKEY_DEBUG(key)); mdbx_cassert(mc, key != NULL && data != NULL); - mdbx_cassert(mc, PAGETYPE(mp) == P_LEAF); + mdbx_cassert(mc, PAGETYPE_COMPAT(mp) == P_LEAF); mdbx_cassert(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); MDBX_page *largepage = NULL; @@ -17312,19 +17314,19 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + mdbx_cassert(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); mdbx_cassert(csrc, csrc->mc_dbi == cdst->mc_dbi); mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); - if (unlikely(PAGETYPE(psrc) != PAGETYPE(pdst))) { + if (unlikely(PAGETYPE_WHOLE(psrc) != PAGETYPE_WHOLE(pdst))) { bailout: mdbx_error("Wrong or mismatch pages's types (src %d, dst %d) to move node", - PAGETYPE(psrc), PAGETYPE(pdst)); + PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst)); csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PROBLEM; } MDBX_val key4move; - switch (PAGETYPE(psrc)) { + switch (PAGETYPE_WHOLE(psrc)) { case P_BRANCH: { const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); mdbx_cassert(csrc, node_flags(srcnode) == 0); @@ -17481,7 +17483,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + mdbx_cassert(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); { /* Adjust other cursors pointing to mp */ @@ -17626,7 +17628,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, pdst->mp_pgno); - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + mdbx_cassert(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); mdbx_cassert(csrc, csrc->mc_dbi == 
cdst->mc_dbi && csrc->mc_db == cdst->mc_db); mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ @@ -17636,7 +17638,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { mdbx_cassert(csrc, csrc->mc_snum < csrc->mc_db->md_depth || IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); mdbx_cassert(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); - const int pagetype = PAGETYPE(psrc); + const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ const unsigned dst_nkeys = page_numkeys(pdst); @@ -17795,14 +17797,15 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) { /* LY: don't touch cursor if top-page is a LEAF */ - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + mdbx_cassert(cdst, + IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } mdbx_cassert(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); - if (unlikely(pagetype != PAGETYPE(top_page))) { + if (unlikely(pagetype != PAGETYPE_WHOLE(top_page))) { /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */ goto bailout; } @@ -17810,8 +17813,9 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (top_page == cdst->mc_pg[cdst->mc_top]) { /* LY: don't touch cursor if prev top-page already on the top */ mdbx_cassert(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + mdbx_cassert(cdst, + IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17828,8 +17832,9 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_top = (uint16_t)new_snum - 1; mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth 
- 1])); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + mdbx_cassert(cdst, + IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17849,8 +17854,9 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_top = (uint16_t)new_snum - 1; mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + mdbx_cassert(cdst, + IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17902,7 +17908,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mdbx_cassert(mc, mc->mc_snum > 0); mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - const int pagetype = PAGETYPE(mc->mc_pg[mc->mc_top]); + const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); STATIC_ASSERT(P_BRANCH == 1); const unsigned minkeys = (pagetype & P_BRANCH) + 1; @@ -18004,7 +18010,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { } } mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); + PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); @@ -18041,7 +18047,8 @@ static int mdbx_rebalance(MDBX_cursor *mc) { &left, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top])); + mdbx_cassert(mc, + PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { rc = page_get( @@ -18049,7 +18056,8 @@ static int mdbx_rebalance(MDBX_cursor *mc) { &right, mc->mc_pg[mc->mc_top]->mp_txnid); if 
(unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top])); + mdbx_cassert(mc, PAGETYPE_WHOLE(right) == + PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } mdbx_cassert(mc, left || right); @@ -18172,7 +18180,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, mdbx_cassert(mc, (mc->mc_checking & CC_LEAF2) == 0 || (mc->mc_flags & C_SUB) != 0); - const uint8_t type = PAGETYPE_EXTRA(mp); + const uint8_t type = PAGETYPE_WHOLE(mp); switch (type) { default: return bad_page(mp, "invalid type (%u)\n", type); @@ -18359,7 +18367,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); if (unlikely(lp.err != MDBX_SUCCESS)) return lp.err; - mdbx_cassert(mc, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); + mdbx_cassert(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); const unsigned npages = number_of_ovpages(env, dsize); if (unlikely(lp.page->mp_pages != npages)) { if (lp.page->mp_pages < npages) @@ -18413,7 +18421,8 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } else { const MDBX_page *const sp = (MDBX_page *)data; - switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) { + switch (sp->mp_flags & + /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { case P_LEAF | P_SUBP: case P_LEAF | P_LEAF2 | P_SUBP: break; @@ -19170,7 +19179,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (unlikely(pure_left | pure_right)) { mc->mc_pg[mc->mc_top] = sister; mc->mc_ki[mc->mc_top] = 0; - switch (PAGETYPE(sister)) { + switch (PAGETYPE_WHOLE(sister)) { case P_LEAF: { mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags); @@ -19181,7 +19190,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, rc = mdbx_node_add_leaf2(mc, 0, newkey); } break; default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + rc = bad_page(sister, 
"wrong page-type %u\n", PAGETYPE_WHOLE(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -19242,7 +19251,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, flags = node_flags(node); } - switch (PAGETYPE(sister)) { + switch (PAGETYPE_WHOLE(sister)) { case P_BRANCH: { mdbx_cassert(mc, 0 == (uint16_t)flags); /* First branch index doesn't need key data. */ @@ -19259,7 +19268,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, rc = mdbx_node_add_leaf2(mc, n, &rkey); } break; */ default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -19561,11 +19570,11 @@ static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, if (tail_bytes) { assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize); assert(npages == 1 && - (PAGETYPE_EXTRA(mp) == P_BRANCH || PAGETYPE_EXTRA(mp) == P_LEAF)); + (PAGETYPE_WHOLE(mp) == P_BRANCH || PAGETYPE_WHOLE(mp) == P_LEAF)); } else { assert(head_bytes <= pgno2bytes(ctx->mc_env, npages)); - assert((npages == 1 && PAGETYPE_EXTRA(mp) == (P_LEAF | P_LEAF2)) || - PAGETYPE_EXTRA(mp) == P_OVERFLOW); + assert((npages == 1 && PAGETYPE_WHOLE(mp) == (P_LEAF | P_LEAF2)) || + PAGETYPE_WHOLE(mp) == P_OVERFLOW); } const pgno_t pgno = ctx->mc_next_pgno; @@ -21068,7 +21077,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { for (unsigned i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); if (node_flags(node) & F_BIGDATA) { - rc = mdbx_page_retire_ex(mc, node_largedata_pgno(node), NULL, 0); + rc = mdbx_page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) @@ -21089,7 +21098,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { } else { mdbx_cassert(mc, mc->mc_snum < 
mc->mc_db->md_depth); mc->mc_checking |= CC_RETIRING; - const int pagetype = + const unsigned pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? P_LEAF : P_BRANCH); for (unsigned i = 0; i < nkeys; i++) { @@ -21097,7 +21106,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { mdbx_tassert(txn, (node_flags(node) & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); const pgno_t pgno = node_pgno(node); - rc = mdbx_page_retire_ex(mc, pgno, NULL, pagetype); + rc = mdbx_page_retire_ex(mc, pgno, nullptr, pagetype); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -21694,7 +21703,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid); err = lp.err; if (err == MDBX_SUCCESS) { - mdbx_cassert(ctx->mw_cursor, PAGETYPE_EXTRA(lp.page) == P_OVERFLOW); + mdbx_cassert(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); npages = lp.page->mp_pages; } @@ -21740,7 +21749,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, size_t subalign_bytes = 0; MDBX_page_type_t subtype; - switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) { + switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { case P_LEAF | P_SUBP: subtype = MDBX_subpage_leaf; break; diff --git a/src/internals.h b/src/internals.h index 185a5fac..25ba9dce 100644 --- a/src/internals.h +++ b/src/internals.h @@ -515,19 +515,18 @@ typedef struct MDBX_page { mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED 
records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define PAGETYPE_EXTRA(p) ((char)(p)->mp_flags) -#define PAGETYPE(p) (PAGETYPE_EXTRA(p) & ~P_SUBP) -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01 /* branch page */ +#define P_LEAF 0x02 /* leaf page */ +#define P_OVERFLOW 0x04 /* overflow page */ +#define P_META 0x08 /* meta page */ +#define P_LEGACY_DIRTY 0x10 /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000 /* spilled in parent txn */ +#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000 /* used for retire page with known status */ #define P_ILL_BITS \ ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; @@ -546,6 +545,14 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((char)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) @@ -1064,7 +1071,7 @@ struct MDBX_cursor { }; #define CHECK_LEAF_TYPE(mc, mp) \ - (((PAGETYPE_EXTRA(mp) ^ (mc)->mc_checking) & \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) /* Context for sorted-dup records. 
From 6076c510f87af3fa9cefb542c689dd482caef794 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 7 Jul 2022 15:51:11 +0300 Subject: [PATCH 024/364] mdbx-tools: refine assertions logging inside `mdbx_chk`. --- src/mdbx_chk.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 1f91b03f..e05a10c5 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -125,7 +125,8 @@ static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) { } } -static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { +static void va_log(MDBX_log_level_t level, const char *function, int line, + const char *msg, va_list args) { static const char *const prefixes[] = { "!!!fatal: ", " ! " /* error */, " ~ " /* warning */, " " /* notice */, " // " /* verbose */, " //// " /* debug */, @@ -143,13 +144,20 @@ static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { fflush(nullptr); fputs(prefixes[level], out); vfprintf(out, msg, args); - if (msg[strlen(msg) - 1] != '\n') + + const bool have_lf = msg[strlen(msg) - 1] == '\n'; + if (level == MDBX_LOG_FATAL && function && line) + fprintf(out, have_lf ? " %s(), %u\n" : " (%s:%u)\n", + function + (strncmp(function, "mdbx_", 5) ? 5 : 0), line); + else if (!have_lf) fputc('\n', out); fflush(nullptr); } if (level == MDBX_LOG_FATAL) { +#if !MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS exit(EXIT_FAILURE_MDBX); +#endif abort(); } } @@ -157,7 +165,7 @@ static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) 
{ va_list args; va_start(args, msg); - va_log(MDBX_LOG_ERROR, msg, args); + va_log(MDBX_LOG_ERROR, nullptr, 0, msg, args); va_end(args); } @@ -166,7 +174,7 @@ static void logger(MDBX_log_level_t level, const char *function, int line, (void)line; (void)function; if (level < MDBX_LOG_EXTRA) - va_log(level, msg, args); + va_log(level, function, line, msg, args); } static int check_user_break(void) { From 19c5e4d42427b3c549c66936fd0c01657021d501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 7 Jul 2022 16:04:44 +0300 Subject: [PATCH 025/364] mdbx-tools: use `MDBX_VALIDATION`. --- src/mdbx_chk.c | 2 +- src/mdbx_dump.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index e05a10c5..e9003573 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -93,7 +93,7 @@ struct { #define dbi_main walk.dbi[MAIN_DBI] #define dbi_meta walk.dbi[CORE_DBS] -int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE; +int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION; MDBX_env *env; MDBX_txn *txn; MDBX_envinfo envinfo; diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index 0f57b599..170a5332 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -356,7 +356,9 @@ int main(int argc, char *argv[]) { rc = mdbx_env_open( env, envname, - envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE : MDBX_RDONLY), 0); + envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION + : MDBX_RDONLY), + 0); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_env_open", rc); goto env_close; From c95143f41bde5db4b6a1912de56362c52eb0c461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 7 Jul 2022 16:44:01 +0300 Subject: [PATCH 026/364] mdbx: add `poor_page()`. 
--- src/core.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/core.c b/src/core.c index 744a7328..176d6c7e 100644 --- a/src/core.c +++ b/src/core.c @@ -706,7 +706,7 @@ __cold static const char *pagetype_caption(const uint8_t type, } } -__cold static int MDBX_PRINTF_ARGS(2, 3) +__cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3) bad_page(const MDBX_page *mp, const char *fmt, ...) { if (mdbx_log_enabled(MDBX_LOG_ERROR)) { static const MDBX_page *prev; @@ -727,6 +727,26 @@ __cold static int MDBX_PRINTF_ARGS(2, 3) return MDBX_CORRUPTED; } +__cold static void MDBX_PRINTF_ARGS(2, 3) + poor_page(const MDBX_page *mp, const char *fmt, ...) { + if (mdbx_log_enabled(MDBX_LOG_NOTICE)) { + static const MDBX_page *prev; + if (prev != mp) { + char buf4unknown[16]; + prev = mp; + mdbx_debug_log(MDBX_LOG_NOTICE, "poorpage", 0, + "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), + mp->mp_pgno, mp->mp_txnid); + } + + va_list args; + va_start(args, fmt); + mdbx_debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args); + va_end(args); + } +} + /* Address of node i in page p */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node * page_node(const MDBX_page *mp, unsigned i) { @@ -13957,9 +13977,9 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (unlikely(r.page->mp_pgno != pgno)) { - bad_page(r.page, - "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", - r.page->mp_pgno, pgno); + r.err = bad_page( + r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", + r.page->mp_pgno, pgno); goto notfound; } @@ -14333,7 +14353,7 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, const MDBX_env *env = mc->mc_txn->mt_env; const size_t dsize = data->iov_len; if (unlikely(node_size_len(node_ks(node), dsize) <= 
env->me_leaf_nodemax)) - bad_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); const unsigned npages = number_of_ovpages(env, dsize); if (unlikely(lp.page->mp_pages != npages)) { if (lp.page->mp_pages < npages) @@ -14341,8 +14361,8 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, "too less n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); else - bad_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", - lp.page->mp_pages, dsize); + poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); } } return MDBX_SUCCESS; @@ -18360,7 +18380,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); if (unlikely(node_size_len(node_ks(node), dsize) <= mc->mc_txn->mt_env->me_leaf_nodemax)) - bad_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); if ((mc->mc_checking & CC_RETIRING) == 0) { const pgr_t lp = @@ -18375,8 +18395,9 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, "too less n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); else - bad_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", - lp.page->mp_pages, dsize); + poor_page(lp.page, + "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); } } continue; From f16bee8fa13fc5deec61b5c33bb919239167201c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 7 Jul 2022 17:33:18 +0300 Subject: [PATCH 027/364] mdbx: fix/setup zero `mod_txnid` during copy-with-compactification. 
--- src/core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index 176d6c7e..12169a8b 100644 --- a/src/core.c +++ b/src/core.c @@ -19570,6 +19570,8 @@ static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, assert(chunk > PAGEHDRSZ); MDBX_page *mp = dst; mp->mp_pgno = pgno; + if (mp->mp_txnid == 0) + mp->mp_txnid = ctx->mc_txn->mt_txnid; if (mp->mp_flags == P_OVERFLOW) { assert(bytes <= pgno2bytes(ctx->mc_env, npages)); mp->mp_pages = npages; @@ -19769,12 +19771,18 @@ __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK; couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + if (!sdb->md_mod_txnid) + sdb->md_mod_txnid = ctx->mc_txn->mt_txnid; return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root, - sdb->md_mod_txnid ? sdb->md_mod_txnid - : ctx->mc_txn->mt_txnid); + sdb->md_mod_txnid); } __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { + mdbx_assert(env, meta->mm_dbs[FREE_DBI].md_mod_txnid || + meta->mm_dbs[FREE_DBI].md_root == P_INVALID); + mdbx_assert(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid || + meta->mm_dbs[MAIN_DBI].md_root == P_INVALID); + /* Calculate filesize taking in account shrink/growing thresholds */ if (meta->mm_geo.next != meta->mm_geo.now) { meta->mm_geo.now = meta->mm_geo.next; @@ -19884,6 +19892,8 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, int thread_err = mdbx_thread_create(&thread, compacting_write_thread, &ctx); if (likely(thread_err == MDBX_SUCCESS)) { if (dest_is_pipe) { + if (!meta->mm_dbs[MAIN_DBI].md_mod_txnid) + meta->mm_dbs[MAIN_DBI].md_mod_txnid = read_txn->mt_txnid; compacting_fixup_meta(env, meta); rc = mdbx_write(fd, buffer, meta_bytes); } From 81ea7bd41e9ca9c0062cdfb37cbf9e2c7de8e41f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 7 Jul 2022 17:54:13 +0300 Subject: [PATCH 028/364] mdbx: fix copy&paste typo inside `meta_checktxnid()`. --- src/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/core.c b/src/core.c index 12169a8b..49584cf2 100644 --- a/src/core.c +++ b/src/core.c @@ -7846,7 +7846,7 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) mdbx_warning( - "catch invalid %sdb_mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN + "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", "free", freedb_mod_txnid, head_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); @@ -7857,7 +7857,7 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) mdbx_warning( - "catch invalid %sdb_mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN + "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", "main", maindb_mod_txnid, head_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); @@ -7872,8 +7872,8 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, if (report) mdbx_warning( "catch invalid root_page_txnid %" PRIaTXN - " for %sdb_mod_txnid %" PRIaTXN " %s", - root_txnid, "free", maindb_mod_txnid, + " for %sdb.mod_txnid %" PRIaTXN " %s", + root_txnid, "free", freedb_mod_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } @@ -7887,7 +7887,7 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, if (report) mdbx_warning( "catch invalid root_page_txnid %" PRIaTXN - " for %sdb_mod_txnid %" PRIaTXN " %s", + " for %sdb.mod_txnid %" PRIaTXN " %s", root_txnid, "main", maindb_mod_txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; From 
a2c4f84f9c1321fb3587efa8f5f7421cc9cb3e1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 01:03:23 +0300 Subject: [PATCH 029/364] mdbx: update ChangeLog for v0.12.x --- ChangeLog.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index b830b543..42b9251e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,13 +1,31 @@ ChangeLog --------- +## v0.12.1 (scheduled to 2022-08-24) + +The release with set of new features. + +New: + + - Added the `Big Foot` feature which significantly reduces GC overhead for processing large lists of retired pages from huge transactions. + Now _libmdbx_ avoid creating large chunks of PNLs (page number lists) which required a long sequences of free pages, aka large/overflow pages. + Thus avoiding searching, allocating and storing such sequences inside GC. + - Added the `gcrtime_seconds16dot16` counter to the "Page Operation Statistics" that accumulates time spent for GC searching and reclaiming. + - Added the `MDBX_VALIDATION` environment options to extra validation of DB structure and pages content for carefully/safe handling damaged or untrusted DB. + - Improved hot/online validation and checking of database pages both for more robustness and performance. + - Added optionally cache for pointers to last/steady meta-pages (currently is off by default). + - Copy-with-compactification now clears/zeroes unused gaps inside database pages. + ## v0.12.0 at 2022-06-19 Not a release but preparation for changing feature set and API. +------------------------------------------------------------------------------- + ## v0.11.8 at 2022-06-12 + Acknowledgements: - [Masatoshi Fukunaga](https://github.com/mah0x211) for [Lua bindings](https://github.com/mah0x211/lua-libmdbx). 
@@ -56,6 +74,7 @@ Minors: ------------------------------------------------------------------------------- + ## v0.11.7 at 2022-04-22 The stable risen release after the Github's intentional malicious disaster. From db0f4e3d1e860bc6ddad6fbfd7fc0d2119228d09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 11:50:57 +0300 Subject: [PATCH 030/364] mdbx: minor fix `mdbx_page_check()` for case debug-audit enforced. --- src/core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/core.c b/src/core.c index 3fbfbf8e..b27d46a6 100644 --- a/src/core.c +++ b/src/core.c @@ -18390,11 +18390,15 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, char *const end_of_page = (char *)mp + env->me_psize; const unsigned nkeys = page_numkeys(mp); - if (unlikely(nkeys <= IS_BRANCH(mp)) && - (!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && - ((mc->mc_checking & CC_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp))) - rc = bad_page(mp, "%s-page nkeys (%u) < %u\n", - IS_BRANCH(mp) ? "branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); + STATIC_ASSERT(P_BRANCH == 1); + if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { + if ((!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && + (!(mc->mc_checking & CC_UPDATING) || + !(IS_MODIFIABLE(mc->mc_txn, mp) || (mp->mp_flags & P_SUBP)))) + rc = + bad_page(mp, "%s-page nkeys (%u) < %u\n", + IS_BRANCH(mp) ? 
"branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); + } if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + nkeys * sizeof(MDBX_node) + nkeys - 1 > env->me_psize)) From b5346ee7658511bb4c8e03b181fd7910a22c3cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 21:48:08 +0300 Subject: [PATCH 031/364] mdbx: use unsigned constants for page flags (to avoid MSVC warnings). --- src/internals.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/internals.h b/src/internals.h index da54282d..8cd0f329 100644 --- a/src/internals.h +++ b/src/internals.h @@ -516,17 +516,17 @@ typedef struct MDBX_page { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_LEGACY_DIRTY 0x10 /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ #define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ 
+#define P_FROZEN 0x8000u /* used for retire page with known status */ #define P_ILL_BITS \ ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; @@ -545,7 +545,7 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; -#define PAGETYPE_WHOLE(p) ((char)(p)->mp_flags) +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) /* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ #define PAGETYPE_COMPAT(p) \ From ad5a83586b34e581b267b322599e7d41cb3aeb90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 21:56:39 +0300 Subject: [PATCH 032/364] mdbx: fix insignificant `uint8_t`-casting warnings. --- src/core.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/core.c b/src/core.c index b27d46a6..2366e746 100644 --- a/src/core.c +++ b/src/core.c @@ -15998,7 +15998,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (mc->mc_ki[mc->mc_top]) err = mdbx_update_key(mc, key); mdbx_cassert(mc, mc->mc_top + dtop < UINT16_MAX); - mc->mc_top += (uint16_t)dtop; + mc->mc_top += (uint8_t)dtop; if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -16857,7 +16857,7 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { #if MDBX_DEBUG > 0 if (mdbx_audit_enabled()) { - const unsigned checking = mc->mc_checking; + const uint8_t checking = mc->mc_checking; mc->mc_checking |= CC_UPDATING; const int page_check_err = mdbx_page_check(mc, mp); mc->mc_checking = checking; @@ -17512,8 +17512,8 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { } /* restore cursor after mdbx_page_search_lowest() */ - csrc->mc_snum = snum; - csrc->mc_top = snum - 1; + csrc->mc_snum = (uint8_t)snum; + csrc->mc_top = (uint8_t)snum - 1; csrc->mc_ki[csrc->mc_top] = 0; /* paranoia */ @@ -17547,8 +17547,8 @@ static int mdbx_node_move(MDBX_cursor 
*csrc, MDBX_cursor *cdst, bool fromleft) { } /* restore cursor after mdbx_page_search_lowest() */ - mn.mc_snum = snum; - mn.mc_top = snum - 1; + mn.mc_snum = (uint8_t)snum; + mn.mc_top = (uint8_t)snum - 1; mn.mc_ki[mn.mc_top] = 0; const intptr_t delta = @@ -17985,8 +17985,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (top_page == cdst->mc_pg[new_snum - 1]) { mdbx_cassert(cdst, cdst->mc_ki[new_snum - 1] == top_indx); /* LY: restore cursor stack */ - cdst->mc_snum = (uint16_t)new_snum; - cdst->mc_top = (uint16_t)new_snum - 1; + cdst->mc_snum = (uint8_t)new_snum; + cdst->mc_top = (uint8_t)new_snum - 1; mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); mdbx_cassert(cdst, @@ -18007,8 +18007,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_ki[new_snum - 1] = top_indx; cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]); cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum]; - cdst->mc_snum = (uint16_t)new_snum; - cdst->mc_top = (uint16_t)new_snum - 1; + cdst->mc_snum = (uint8_t)new_snum; + cdst->mc_top = (uint8_t)new_snum - 1; mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); mdbx_cassert(cdst, @@ -18757,7 +18757,7 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc) { } __cold static int mdbx_cursor_check_updating(MDBX_cursor *mc) { - const unsigned checking = mc->mc_checking; + const uint8_t checking = mc->mc_checking; mc->mc_checking |= CC_UPDATING; const int rc = mdbx_cursor_check(mc); mc->mc_checking = checking; @@ -19367,12 +19367,12 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]), &sepkey); if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { - mc->mc_top -= i; + mc->mc_top -= (uint8_t)i; mdbx_debug("update new-first on parent [%i] page %u key %s", mc->mc_ki[mc->mc_top], 
mc->mc_pg[mc->mc_top]->mp_pgno, DKEY(newkey)); rc = mdbx_update_key(mc, newkey); - mc->mc_top += i; + mc->mc_top += (uint8_t)i; if (unlikely(rc != MDBX_SUCCESS)) goto done; } From baea4c81c9fc74de5b167b1493a8f5ef3b07953f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 22:04:37 +0300 Subject: [PATCH 033/364] mdbx: simplify `safe64_reset()`. --- src/core.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/src/core.c b/src/core.c index 2366e746..1f63520e 100644 --- a/src/core.c +++ b/src/core.c @@ -1086,40 +1086,34 @@ static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) { return txnid; } -#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__) -#define safe64_reset(p, single_writer) \ - atomic_store64(p, UINT64_MAX, \ - (single_writer) ? mo_AcquireRelease \ - : mo_SequentialConsistency) -#else +/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */ static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p, bool single_writer) { -#if !MDBX_64BIT_CAS - if (!single_writer) { - STATIC_ASSERT(xMDBX_TXNID_STEP > 1); + if (single_writer) { +#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); +#else + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); +#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */ + } else { +#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ + atomic_store64(p, UINT64_MAX, mo_SequentialConsistency); +#elif MDBX_64BIT_CAS + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ + atomic_store32(&p->high, UINT32_MAX, mo_SequentialConsistency); +#else /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1 * and overflow was preserved in safe64_txnid_next() */ + 
STATIC_ASSERT(xMDBX_TXNID_STEP > 1); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - atomic_store32( - &p->high, UINT32_MAX, - mo_Relaxed) /* atomically make >= SAFE64_INVALID_THRESHOLD */; + atomic_store32(&p->high, UINT32_MAX, mo_SequentialConsistency); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - } else -#endif /* !MDBX_64BIT_CAS */ -#if MDBX_64BIT_ATOMIC - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ - atomic_store64(p, UINT64_MAX, - single_writer ? mo_AcquireRelease - : mo_SequentialConsistency); -#else - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ - atomic_store32(&p->high, UINT32_MAX, - single_writer ? mo_AcquireRelease : mo_SequentialConsistency); -#endif /* MDBX_64BIT_ATOMIC */ +#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */ + } assert(p->weak >= SAFE64_INVALID_THRESHOLD); mdbx_jitter4testing(true); } -#endif /* LCC && MDBX_HAVE_C11ATOMICS */ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, txnid_t compare) { From 6f6c581c6e09240b46f6fd852fd35ff225977f9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 22:08:10 +0300 Subject: [PATCH 034/364] mdbx: minor refine `safe64_write()`, `safe64_read()` and `safe64_inc()`. 
--- src/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/core.c b/src/core.c index 1f63520e..9f38e65f 100644 --- a/src/core.c +++ b/src/core.c @@ -1146,7 +1146,7 @@ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, const uint64_t v) { assert(p->weak >= SAFE64_INVALID_THRESHOLD); -#if MDBX_64BIT_ATOMIC +#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS atomic_store64(p, v, mo_AcquireRelease); #else /* MDBX_64BIT_ATOMIC */ mdbx_compiler_barrier(); @@ -1163,8 +1163,10 @@ static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { mdbx_jitter4testing(true); - uint64_t v = atomic_load64(p, mo_AcquireRelease); - mdbx_jitter4testing(true); + uint64_t v; + do + v = atomic_load64(p, mo_AcquireRelease); + while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak)); return v; } @@ -1206,7 +1208,7 @@ MDBX_MAYBE_UNUSED static void safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) { assert(v > 0); - safe64_update(p, atomic_load64(p, mo_Relaxed) + v); + safe64_update(p, safe64_read(p) + v); } /*----------------------------------------------------------------------------*/ From d572052178c68e89861e38171b5100513b432c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 22:09:57 +0300 Subject: [PATCH 035/364] mdbx: refine `meta_checktxnid()` and `meta_waittxnid()`. 
--- src/core.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/src/core.c b/src/core.c index 9f38e65f..75e67284 100644 --- a/src/core.c +++ b/src/core.c @@ -3919,7 +3919,7 @@ static int __must_check_result mdbx_page_split(MDBX_cursor *mc, MDBX_val *const newdata, pgno_t newpgno, unsigned nflags); -static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, +static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, bool report); static int __must_check_result mdbx_validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, @@ -7860,7 +7860,7 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { } /* check against todo4recovery://erased_by_github/libmdbx/issues/269 */ -static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, +static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, bool report) { const txnid_t head_txnid = meta_txnid(env, meta); const txnid_t freedb_mod_txnid = meta->mm_dbs[FREE_DBI].md_mod_txnid; @@ -7875,9 +7875,9 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, const MDBX_page *maindb_root = (env->me_map && maindb_root_pgno != P_INVALID) ? 
pgno2page(env, maindb_root_pgno) : nullptr; - const uint64_t magic_and_version = - unaligned_peek_u64(4, &meta->mm_magic_and_version); + unaligned_peek_u64_volatile(4, &meta->mm_magic_and_version); + bool ok = true; if (unlikely(!head_txnid || head_txnid < freedb_mod_txnid || (!freedb_mod_txnid && freedb_root && @@ -7934,31 +7934,37 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, return ok; } +__cold static bool is_timeout(uint64_t *timestamp) { + if (likely(!*timestamp)) { + *timestamp = mdbx_osal_monotime(); + return false; + } + return mdbx_osal_monotime() - *timestamp > 65536 / 10; +} + /* check with timeout as the workaround * for todo4recovery://erased_by_github/libmdbx/issues/269 */ -static int meta_waittxnid(const MDBX_env *env, const MDBX_meta *meta, +static int meta_waittxnid(const MDBX_env *env, const volatile MDBX_meta *meta, uint64_t *timestamp) { if (likely(meta_checktxnid(env, meta, !*timestamp))) return MDBX_SUCCESS; - if (!*timestamp) - *timestamp = mdbx_osal_monotime(); - else if (unlikely(mdbx_osal_monotime() - *timestamp > 65536 / 10)) { - mdbx_error("bailout waiting for valid snapshot %s", - "(workaround for incoherent flaw of unified page/buffer cache)"); - return MDBX_CORRUPTED; + if (likely(!is_timeout(timestamp))) { +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); +#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) + sched_yield(); +#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) + pthread_yield(); +#else + usleep(42); +#endif + return MDBX_RESULT_TRUE; } -#if defined(_WIN32) || defined(_WIN64) - SwitchToThread(); -#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) - sched_yield(); -#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) - pthread_yield(); -#else - usleep(42); -#endif - return MDBX_RESULT_TRUE; + mdbx_error("bailout waiting for valid snapshot (%s)", + "workaround for incoherent 
flaw of unified page/buffer cache"); + return MDBX_CORRUPTED; } /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ @@ -8171,7 +8177,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { uint64_t timestamp = 0; while ( "workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { - rc = meta_waittxnid(env, (const MDBX_meta *)meta, &timestamp); + rc = meta_waittxnid(env, meta, &timestamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) From a4a35ce9cb7176b914e0606d38c6d6e47a9431f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 22:14:28 +0300 Subject: [PATCH 036/364] mdbx: rework `find_oldest_reader()`. 1. Fix regression `assert: oldest >= lck->mti_oldest_reader.weak` after d4bf0a3332c7b05331ab0a87e3cd65b0903edc3c. 2. Add explicit check, kick and notice for stuck reader. 3. Made more e2k-friendly. --- src/core.c | 81 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/src/core.c b/src/core.c index 75e67284..6488bc0d 100644 --- a/src/core.c +++ b/src/core.c @@ -5952,53 +5952,66 @@ static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. 
*/ -static txnid_t find_oldest_reader(const MDBX_env *env) { - const txnid_t steady_edge = - constmeta_txnid(env, constmeta_prefer_steady(env)); - mdbx_assert(env, steady_edge <= env->me_txn0->mt_txnid); +static txnid_t find_oldest_reader(MDBX_env *env) { + const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); + const txnid_t steady = constmeta_txnid(env, constmeta_prefer_steady(env)); + mdbx_assert(env, steady <= env->me_txn0->mt_txnid); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL /* exclusive without-lck mode */)) { mdbx_assert(env, env->me_lck == (void *)&env->x_lckless_stub); - return env->me_lck->mti_oldest_reader.weak = steady_edge; + return env->me_lck->mti_oldest_reader.weak = steady; } - const txnid_t last_oldest = + const txnid_t prev_oldest = atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); - mdbx_assert(env, steady_edge >= last_oldest); - if (likely(last_oldest == steady_edge)) - return steady_edge; + mdbx_assert(env, steady >= prev_oldest); - const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); - const uint32_t snap_readers_refresh_flag = - atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease); - mdbx_jitter4testing(false); - if (snap_readers_refresh_flag == nothing_changed) - return last_oldest; + txnid_t new_oldest = prev_oldest; + while (new_oldest != steady && + nothing_changed != + atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { + lck->mti_readers_refresh_flag.weak = nothing_changed; + mdbx_jitter4testing(false); + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + new_oldest = steady; - atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed); - const unsigned snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - txnid_t oldest = steady_edge; - for (unsigned i = 0; i < snap_nreaders; ++i) { - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* 
mdbx_jitter4testing(true); */ - const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); - if (oldest > snap && /* ignore pending updates */ snap <= steady_edge) { - oldest = snap; - if (oldest == last_oldest) - return oldest; + for (unsigned i = 0; i < snap_nreaders; ++i) { + const mdbx_pid_t pid = + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); + if (!pid) + continue; + mdbx_jitter4testing(true); + + const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); + if (unlikely(rtxn < prev_oldest)) { + if (unlikely(nothing_changed == + atomic_load32(&lck->mti_readers_refresh_flag, + mo_AcquireRelease)) && + safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) { + mdbx_notice("kick stuck reader[%u of %u].pid_%u %" PRIaTXN + " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, + i, snap_nreaders, pid, rtxn, prev_oldest, steady); + } + continue; + } + + if (rtxn < new_oldest) { + new_oldest = rtxn; + if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest) + break; } } - } - if (oldest != last_oldest) { - mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, - oldest); - mdbx_assert(env, oldest >= lck->mti_oldest_reader.weak); - atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); + if (new_oldest != prev_oldest) { + mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, + new_oldest); + mdbx_assert(env, new_oldest >= lck->mti_oldest_reader.weak); + atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); + } } - return oldest; + return new_oldest; } /* Find largest mvcc-snapshot still referenced. */ From 434ad8edc866d5f86c6e0c5cd2b5ac161b094b8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 22:21:39 +0300 Subject: [PATCH 037/364] mdbx: refine `bind_rslot()`. 
--- src/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/core.c b/src/core.c index 6488bc0d..5c8d5825 100644 --- a/src/core.c +++ b/src/core.c @@ -7767,10 +7767,10 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { result.err = MDBX_SUCCESS; unsigned slot, nreaders; while (1) { - nreaders = atomic_load32(&env->me_lck->mti_numreaders, mo_Relaxed); + nreaders = env->me_lck->mti_numreaders.weak; for (slot = 0; slot < nreaders; slot++) - if (atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, mo_Relaxed) == - 0) + if (!atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, + mo_AcquireRelease)) break; if (likely(slot < env->me_maxreaders)) @@ -7791,13 +7791,12 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { * slot, next publish it in lck->mti_numreaders. After * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. */ - atomic_store32(&result.rslot->mr_pid, 0, mo_Relaxed); + atomic_store32(&result.rslot->mr_pid, 0, mo_SequentialConsistency); safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) - atomic_store32(&env->me_lck->mti_numreaders, ++nreaders, mo_Relaxed); - atomic_store64(&result.rslot->mr_tid, (env->me_flags & MDBX_NOTLS) ? 0 : tid, - mo_Relaxed); - atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_Relaxed); + env->me_lck->mti_numreaders.weak = ++nreaders; + result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 
0 : tid; + atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); mdbx_rdt_unlock(env); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { @@ -7862,6 +7861,7 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { r->mr_tid.weak != mdbx_thread_self())) return MDBX_BAD_RSLOT; + mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) return MDBX_BUSY /* transaction is still active */; From 9421bb424d91b1531a37aad109a8594b27990dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 22:29:24 +0300 Subject: [PATCH 038/364] mdbx: refine/simplify read-latch loop inside `mdbx_txn_renew0()`. 1. Explicitly check and handle a race/collision case with `find_oldest_reader()`. 2. Handle "recovery mode" (me_stuck_meta >= 0) by the same code as for regular latch. 3. Add bailout error message for buggy compiler and/or hardware (paranoid). 
--- src/core.c | 125 +++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 66 deletions(-) diff --git a/src/core.c b/src/core.c index 5c8d5825..1632c78c 100644 --- a/src/core.c +++ b/src/core.c @@ -8054,84 +8054,77 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Seek & fetch the last meta */ - if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { - uint64_t timestamp = 0; - while (1) { - meta_cache_clear(env); - volatile const MDBX_meta *const meta = meta_prefer_last(env); - mdbx_jitter4testing(false); - const txnid_t snap = meta_txnid(env, meta); - mdbx_jitter4testing(false); - if (likely(r)) { - safe64_reset(&r->mr_txnid, false); - atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, - mo_Relaxed); - atomic_store64(&r->mr_snapshot_pages_retired, - unaligned_peek_u64_volatile(4, meta->mm_pages_retired), - mo_Relaxed); - safe64_write(&r->mr_txnid, snap); - mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); - mdbx_assert( - env, r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.weak == snap); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_AcquireRelease); - } else { - /* exclusive mode without lck */ - } - mdbx_jitter4testing(true); - - /* Snap the state from current meta-head */ - txn->mt_txnid = snap; - txn->mt_geo = meta->mm_geo; - STATIC_ASSERT(CORE_DBS == 2); - txn->mt_dbs[0] = meta->mm_dbs[0]; - txn->mt_dbs[1] = meta->mm_dbs[1]; - txn->mt_canary = meta->mm_canary; - - /* LY: Retry on a race, ITS#7970. - * The barrier is not needed here since C11-atomics are used, - * but it is reasonable paranoia to avoid compiler misoptimization - * and makes clarity for code readers. 
*/ - mdbx_compiler_barrier(); - if (likely(meta == meta_prefer_last(env) && - snap == meta_txnid(env, meta) && - snap >= atomic_load64(&env->me_lck->mti_oldest_reader, - mo_AcquireRelease))) { - rc = meta_waittxnid(env, (const MDBX_meta *)meta, &timestamp); - mdbx_jitter4testing(false); - if (likely(rc == MDBX_SUCCESS)) - break; - if (likely(rc == MDBX_RESULT_TRUE)) - continue; - goto bailout; - } - } - } else { - /* r/o recovery mode */ - MDBX_meta *const meta = METAPAGE(env, env->me_stuck_meta); - txn->mt_txnid = constmeta_txnid(env, meta); - txn->mt_geo = meta->mm_geo; - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); - txn->mt_canary = meta->mm_canary; + uint64_t timestamp = 0; + unsigned loop = 0; + while (1) { + meta_cache_clear(env); + volatile const MDBX_meta *const meta = + likely(env->me_stuck_meta < 0) + ? /* regular */ meta_prefer_last(env) + : /* recovery mode */ METAPAGE(env, env->me_stuck_meta); + mdbx_jitter4testing(false); + const txnid_t target_txnid = meta_txnid(env, meta); + mdbx_jitter4testing(false); if (likely(r)) { + safe64_reset(&r->mr_txnid, false); atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, mo_Relaxed); atomic_store64(&r->mr_snapshot_pages_retired, - unaligned_peek_u64(4, meta->mm_pages_retired), + unaligned_peek_u64_volatile(4, meta->mm_pages_retired), mo_Relaxed); - atomic_store64(&r->mr_txnid, txn->mt_txnid, mo_Relaxed); + safe64_write(&r->mr_txnid, target_txnid); mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); mdbx_assert( env, r->mr_tid.weak == ((env->me_flags & MDBX_NOTLS) ? 
0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.weak == txn->mt_txnid); + mdbx_assert(env, + r->mr_txnid.weak == target_txnid || + (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && + target_txnid < env->me_lck->mti_oldest_reader.weak)); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_Relaxed); + mo_AcquireRelease); + } else { + /* exclusive mode without lck */ + mdbx_assert(env, !env->me_lck_mmap.lck && + env->me_lck == (void *)&env->x_lckless_stub); } + mdbx_jitter4testing(true); + + /* Snap the state from current meta-head */ + txn->mt_txnid = target_txnid; + txn->mt_geo = meta->mm_geo; + STATIC_ASSERT(CORE_DBS == 2); + txn->mt_dbs[0] = meta->mm_dbs[0]; + txn->mt_dbs[1] = meta->mm_dbs[1]; + txn->mt_canary = meta->mm_canary; + + /* LY: Retry on a race, ITS#7970. + * The barrier is not needed here since C11-atomics are used, + * but it is reasonable paranoia to avoid compiler misoptimization + * and makes clarity for code readers. */ + mdbx_compiler_barrier(); + const txnid_t oldest = + atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease); + if (unlikely(target_txnid < oldest || + (meta != meta_prefer_last(env) && env->me_stuck_meta < 0) || + target_txnid != meta_txnid(env, meta))) { + if (unlikely(++loop > 42)) { + mdbx_error("bailout waiting for valid snapshot (%s)", + "metapages are too volatile"); + rc = MDBX_PROBLEM; + goto bailout; + } + timestamp = 0; + continue; + } + + rc = meta_waittxnid(env, meta, &timestamp); + mdbx_jitter4testing(false); + if (likely(rc == MDBX_SUCCESS)) + break; + if (unlikely(rc != MDBX_RESULT_TRUE)) + goto bailout; } if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { From 9aa2aae93e07a815909f68ac5c86c7c9a69b0a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 23:11:16 +0300 Subject: [PATCH 039/364] mdbx: rework/simplify `kick_longlived_readers()`. 
--- mdbx.h | 11 +++-- src/core.c | 135 ++++++++++++++++++++++++----------------------------- 2 files changed, 68 insertions(+), 78 deletions(-) diff --git a/mdbx.h b/mdbx.h index 9af7e74a..edc9fa02 100644 --- a/mdbx.h +++ b/mdbx.h @@ -5070,11 +5070,12 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env); * this value into account to evaluate the impact that * a long-running transaction has. * \param [in] retry A retry number starting from 0. - * If callback has returned 0 at least once, then at end - * of current handling loop the callback function will be - * called additionally with negative value to notify about - * the end of loop. The callback function can use this value - * to implement timeout logic while waiting for readers. + * If callback has returned 0 at least once, then at end of + * current handling loop the callback function will be + * called additionally with negative `retry` value to notify + * about the end of loop. The callback function can use this + * fact to implement timeout reset logic while waiting for + * a readers. 
* * \returns The RETURN CODE determines the further actions libmdbx and must * match the action which was executed by the callback: diff --git a/src/core.c b/src/core.c index 1632c78c..fa5102ab 100644 --- a/src/core.c +++ b/src/core.c @@ -3839,8 +3839,7 @@ typedef struct page_result { int err; } pgr_t; -static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, - const txnid_t laggard); +static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages); @@ -6966,9 +6965,8 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, if (flags & MDBX_ALLOC_GC) { const txnid_t laggard = find_oldest_reader(env); - if (laggard >= detent || - (laggard < txn->mt_txnid - xMDBX_TXNID_STEP && - mdbx_kick_longlived_readers(env, laggard) >= detent)) + if (laggard >= detent || (laggard < txn->mt_txnid - xMDBX_TXNID_STEP && + kick_longlived_readers(env, laggard) >= detent)) continue; } @@ -21648,69 +21646,51 @@ __cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { return rc; } -__cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, - const txnid_t laggard) { +__cold static txnid_t kick_longlived_readers(MDBX_env *env, + const txnid_t laggard) { mdbx_debug("DB size maxed out by reading #%" PRIaTXN, laggard); - - int retry; - for (retry = 0; retry < INT_MAX; ++retry) { - txnid_t oldest = constmeta_txnid(env, constmeta_prefer_steady(env)); + MDBX_hsr_func *const callback = env->me_hsr_callback; + txnid_t oldest = 0; + bool notify_eof_of_loop = false; + int retry = 0; + do { + env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; + oldest = find_oldest_reader(env); mdbx_assert(env, oldest < env->me_txn0->mt_txnid); mdbx_assert(env, oldest >= laggard); mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak); + + const txnid_t steady = meta_txnid(env, 
meta_prefer_steady(env)); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (oldest == laggard || unlikely(!lck /* without-LCK mode */)) - return oldest; + if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck) + break; if (MDBX_IS_ERROR(mdbx_cleanup_dead_readers(env, false, NULL))) break; - MDBX_reader *asleep = nullptr; - uint64_t oldest_retired = UINT64_MAX; - const unsigned snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { - retry: - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ - const uint64_t snap_retired = atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); - const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (unlikely(snap_retired != - atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, - mo_AcquireRelease) || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) - goto retry; - if (oldest > snap_txnid && - laggard <= /* ignore pending updates */ snap_txnid) { - oldest = snap_txnid; - oldest_retired = snap_retired; - asleep = &lck->mti_readers[i]; - } - } - } - - if (laggard < oldest || !asleep) { - if (retry && env->me_hsr_callback) { - /* LY: notify end of hsr-loop */ - const txnid_t gap = oldest - laggard; - env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, - (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, 0, - -retry); - } - mdbx_notice("hsr-kick: update oldest %" PRIaTXN " -> %" PRIaTXN, - lck->mti_oldest_reader.weak, oldest); - mdbx_assert(env, lck->mti_oldest_reader.weak <= oldest); - return atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); - } - - if (!env->me_hsr_callback) + if (!callback) break; - uint32_t pid = atomic_load32(&asleep->mr_pid, mo_AcquireRelease); - uint64_t tid = asleep->mr_tid.weak; - if (safe64_read(&asleep->mr_txnid) != laggard || pid <= 0) + MDBX_reader *stucked = nullptr; + uint64_t hold_retired = 0; + for (unsigned i = 0; i < lck->mti_numreaders.weak; ++i) { + const uint64_t snap_retired = atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); + const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); + if (rtxn == laggard && + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { + hold_retired = snap_retired; + stucked = &lck->mti_readers[i]; + } + } + + if (!stucked) + break; + + uint32_t pid = atomic_load32(&stucked->mr_pid, mo_AcquireRelease); + uint64_t tid = atomic_load64(&stucked->mr_tid, mo_AcquireRelease); + if (safe64_read(&stucked->mr_txnid) != laggard || !pid || + stucked->mr_snapshot_pages_retired.weak != hold_retired) continue; const MDBX_meta *head_meta = constmeta_prefer_last(env); @@ -21719,32 +21699,41 @@ __cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, const uint64_t head_retired = unaligned_peek_u64(4, head_meta->mm_pages_retired); const size_t space = - (head_retired > oldest_retired) - ? pgno2bytes(env, (pgno_t)(head_retired - oldest_retired)) + (head_retired > hold_retired) + ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0; - int rc = env->me_hsr_callback( - env, env->me_txn, pid, (mdbx_tid_t)tid, laggard, - (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); + int rc = + callback(env, env->me_txn, pid, (mdbx_tid_t)tid, laggard, + (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, space, retry); if (rc < 0) + /* hsr returned error and/or agree MDBX_MAP_FULL error */ break; if (rc > 0) { if (rc == 1) { - safe64_reset_compare(&asleep->mr_txnid, laggard); + /* hsr reported transaction (will be) aborted asynchronous */ + safe64_reset_compare(&stucked->mr_txnid, laggard); } else { - safe64_reset(&asleep->mr_txnid, true); - atomic_store64(&asleep->mr_tid, 0, mo_Relaxed); - atomic_store32(&asleep->mr_pid, 0, mo_Relaxed); + /* hsr reported reader process was killed and slot should be cleared */ + safe64_reset(&stucked->mr_txnid, true); + atomic_store64(&stucked->mr_tid, 0, mo_Relaxed); + atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease); } - atomic_store32(&lck->mti_readers_refresh_flag, true, mo_Relaxed); - } - } + } else + notify_eof_of_loop = true; - if (retry && env->me_hsr_callback) { - /* LY: notify end of hsr-loop */ - env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry); + } while (++retry < INT_MAX); + + if (notify_eof_of_loop) { + /* notify end of hsr-loop */ + const txnid_t turn = oldest - laggard; + if (turn) + mdbx_notice("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, + laggard, oldest, turn); + callback(env, env->me_txn, 0, 0, laggard, + (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry); } - return find_oldest_reader(env); + return oldest; } #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API From 9108a241a261dc1516ccf86cb5700c6cb1209cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 8 Jul 2022 23:13:44 +0300 Subject: [PATCH 040/364] mdbx: minor fix/clarify debug logging inside `page_alloc_slowpath()`. 
--- src/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index fa5102ab..eb07cfe7 100644 --- a/src/core.c +++ b/src/core.c @@ -6907,7 +6907,7 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, if (head != steady && META_IS_STEADY(steady) && detent == constmeta_txnid(env, steady) + 1) { mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN - "-%s, oldest %" PRIaTXN, + "-%s, detent %" PRIaTXN, constmeta_txnid(env, head), mdbx_durable_str(head), constmeta_txnid(env, steady), mdbx_durable_str(steady), detent); From e8dd208e967c809fd7036801d209cddeace804e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 9 Jul 2022 00:39:41 +0300 Subject: [PATCH 041/364] mdbx: more cursor-checking for audit-without-debug. --- src/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/core.c b/src/core.c index eb07cfe7..354fd730 100644 --- a/src/core.c +++ b/src/core.c @@ -16861,7 +16861,6 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= sz); mp->mp_upper += (indx_t)sz; -#if MDBX_DEBUG > 0 if (mdbx_audit_enabled()) { const uint8_t checking = mc->mc_checking; mc->mc_checking |= CC_UPDATING; @@ -16869,7 +16868,6 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { mc->mc_checking = checking; mdbx_cassert(mc, page_check_err == MDBX_SUCCESS); } -#endif /* MDBX_DEBUG > 0 */ } /* Compact the main page after deleting a node on a subpage. From 194f2f45d27b54498951e50f8409e48629f3f312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 9 Jul 2022 18:07:54 +0300 Subject: [PATCH 042/364] mdbx: refine/fix using nested cursor's db inside copy-with-compactification. 
--- src/core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 354fd730..d950402f 100644 --- a/src/core.c +++ b/src/core.c @@ -19839,8 +19839,15 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, &nested->md_root, mp->mp_txnid); } } else { + mdbx_cassert(mc, + (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0); MDBX_cursor_couple *couple = - container_of(mc, MDBX_cursor_couple, inner.mx_cursor); + container_of(mc, MDBX_cursor_couple, outer); + mdbx_cassert(mc, couple->inner.mx_cursor.mc_signature == + ~MDBX_MC_LIVE && + !couple->inner.mx_cursor.mc_flags && + !couple->inner.mx_cursor.mc_db && + !couple->inner.mx_cursor.mc_dbx); nested = &couple->inner.mx_db; memcpy(nested, node_data(node), sizeof(MDBX_db)); rc = compacting_walk_sdb(ctx, nested); @@ -19908,6 +19915,8 @@ __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { return MDBX_SUCCESS; /* empty db */ MDBX_cursor_couple couple; + memset(&couple, 0, sizeof(couple)); + couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; uint8_t dbistate = DBI_VALID | DBI_AUDITED; int rc = mdbx_couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); From 12d2879a9fa3561391702e7641345788202694f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 9 Jul 2022 19:40:09 +0300 Subject: [PATCH 043/364] mdbx: extend descriptions for `MDBX_VALIDATION` and update TODO for done item. --- TODO.md | 6 +++++- mdbx.h | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index 79c4e54d..db853653 100644 --- a/TODO.md +++ b/TODO.md @@ -12,7 +12,6 @@ For the same reason ~~Github~~ is blacklisted forever. So currently most of the links are broken due to noted malicious ~~Github~~ sabotage. 
- [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). - - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223). - [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/204). - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/210). - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200). @@ -23,3 +22,8 @@ So currently most of the links are broken due to noted malicious ~~Github~~ sabo - [Support MessagePack for Keys & Values](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/115). - [Engage new terminology](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/137). - Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc. + +Done +---- + + - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223). diff --git a/mdbx.h b/mdbx.h index edc9fa02..3d13751b 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1023,7 +1023,11 @@ LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, enum MDBX_env_flags_t { MDBX_ENV_DEFAULTS = 0, - /** Extra validation of DB structure and pages content. */ + /** Extra validation of DB structure and pages content. + * + * The `MDBX_VALIDATION` enabled the simple safe/careful mode for working + * with damaged or untrusted DB. However, a notable performance + * degradation should be expected. 
*/ MDBX_VALIDATION = UINT32_C(0x00002000), /** No environment directory. From 149e708830da5b6c84f330c0c4a7679bc9908bb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 10 Jul 2022 09:18:29 +0300 Subject: [PATCH 044/364] mdbx: rename `MDBX_CACHE_METAPTR` build-time option. --- src/core.c | 45 +++++++++++++++++++++------------------------ src/internals.h | 4 ++-- src/options.h | 10 +++++----- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/src/core.c b/src/core.c index d950402f..078ac313 100644 --- a/src/core.c +++ b/src/core.c @@ -5743,12 +5743,12 @@ constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) { } static __inline void meta_cache_clear(MDBX_env *env) { -#if MDBX_CACHE_METAS +#if MDBX_CACHE_METAPTR env->cache_last_meta = nullptr; env->cache_steady_meta = nullptr; #else (void)env; -#endif /* MDBX_CACHE_METAS */ +#endif /* MDBX_CACHE_METAPTR */ } static __inline txnid_t meta_txnid(const MDBX_env *env, @@ -5888,44 +5888,41 @@ meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { static volatile const MDBX_meta *meta_prefer_steady(const MDBX_env *env) { return -#if MDBX_CACHE_METAS +#if MDBX_CACHE_METAPTR ((MDBX_env *)env)->cache_steady_meta = -#endif /* MDBX_CACHE_METAS */ +#endif /* MDBX_CACHE_METAPTR */ meta_mostrecent(prefer_steady, env); } MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * constmeta_prefer_steady(const MDBX_env *env) { -#if MDBX_CACHE_METAS - mdbx_assert(env, !env->cache_steady_meta || - env->cache_steady_meta == - meta_mostrecent(prefer_steady, env)); - return (const MDBX_meta *)(env->cache_steady_meta ? 
env->cache_steady_meta : -#else - return (const MDBX_meta *)( -#endif /* MDBX_CACHE_METAS */ - meta_prefer_steady(env)); +#if MDBX_CACHE_METAPTR + if (likely(env->cache_steady_meta)) { + mdbx_assert(env, + env->cache_steady_meta == meta_mostrecent(prefer_steady, env)); + return (const MDBX_meta *)env->cache_steady_meta; + } +#endif /* MDBX_CACHE_METAPTR */ + return (const MDBX_meta *)meta_prefer_steady(env); } static volatile const MDBX_meta *meta_prefer_last(const MDBX_env *env) { return -#if MDBX_CACHE_METAS +#if MDBX_CACHE_METAPTR ((MDBX_env *)env)->cache_last_meta = -#endif /* MDBX_CACHE_METAS */ +#endif /* MDBX_CACHE_METAPTR */ meta_mostrecent(prefer_last, env); } MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * constmeta_prefer_last(const MDBX_env *env) { -#if MDBX_CACHE_METAS - mdbx_assert(env, - !env->cache_last_meta || - env->cache_last_meta == meta_mostrecent(prefer_last, env)); - return (const MDBX_meta *)(env->cache_last_meta ? env->cache_last_meta : -#else - return (const MDBX_meta *)( -#endif /* MDBX_CACHE_METAS */ - meta_prefer_last(env)); +#if MDBX_CACHE_METAPTR + if (likely(env->cache_last_meta)) { + mdbx_assert(env, env->cache_last_meta == meta_mostrecent(prefer_last, env)); + return (const MDBX_meta *)env->cache_last_meta; + } +#endif /* MDBX_CACHE_METAPTR */ + return (const MDBX_meta *)meta_prefer_last(env); } static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { diff --git a/src/internals.h b/src/internals.h index 8cd0f329..cf6a5f33 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1189,10 +1189,10 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ mdbx_fastmutex_t me_dbi_lock; -#if MDBX_CACHE_METAS +#if MDBX_CACHE_METAPTR volatile const MDBX_meta *cache_last_meta; volatile const MDBX_meta *cache_steady_meta; -#endif /* MDBX_CACHE_METAS */ +#endif /* MDBX_CACHE_METAPTR */ MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ diff --git 
a/src/options.h b/src/options.h index 7bf26f1e..b4559f3a 100644 --- a/src/options.h +++ b/src/options.h @@ -92,11 +92,11 @@ #error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 #endif /* MDBX_ENABLE_BIGFOOT */ -#ifndef MDBX_CACHE_METAS -#define MDBX_CACHE_METAS 0 -#elif !(MDBX_CACHE_METAS == 0 || MDBX_CACHE_METAS == 1) -#error MDBX_CACHE_METAS must be defined as 0 or 1 -#endif /* MDBX_CACHE_METAS */ +#ifndef MDBX_CACHE_METAPTR +#define MDBX_CACHE_METAPTR 0 +#elif !(MDBX_CACHE_METAPTR == 0 || MDBX_CACHE_METAPTR == 1) +#error MDBX_CACHE_METAPTR must be defined as 0 or 1 +#endif /* MDBX_CACHE_METAPTR */ /** Controls use of POSIX madvise() hints and friends. */ #ifndef MDBX_ENABLE_MADVISE From 5ccfb5f30a4ab124052369768da2ba679b8bc8c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 10 Jul 2022 13:56:41 +0300 Subject: [PATCH 045/364] mdbx-tools: use `MDBX_DBG_DUMP`, `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` inside `mdbx_chk`. --- src/mdbx_chk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index e9003573..0b288f08 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -1239,7 +1239,9 @@ int main(int argc, char *argv[]) { mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1) ? 
(MDBX_log_level_t)(verbose + 1) : MDBX_LOG_TRACE, - MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, logger); + MDBX_DBG_DUMP | MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | + MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, + logger); rc = mdbx_env_create(&env); if (rc) { From ac4b6d7121af548ba588343be6e27bec451841ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 11 Jul 2022 10:24:28 +0300 Subject: [PATCH 046/364] mdbx-test: always engage `MDBX_DBG_DUMP.` --- test/log.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/log.cc b/test/log.cc index beb96203..9597328f 100644 --- a/test/log.cc +++ b/test/log.cc @@ -61,7 +61,8 @@ static FILE *last; void setlevel(loglevel priority) { level = priority; int rc = mdbx_setup_debug(MDBX_log_level_t(priority), - MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER, + MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | + MDBX_DBG_DUMP, mdbx_logger); log_trace("set mdbx debug-opts: 0x%02x", rc); } From 0018164fef048b68dd84d503fde95dab5fdea94b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 11 Jul 2022 20:19:55 +0300 Subject: [PATCH 047/364] mdbx: fix wrong `}` oops-like typo. This is a `devel`-only 3-days old regression since a4a35ce9cb7176b914e0606d38c6d6e47a9431f1. 
--- src/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index 078ac313..b5a8d6f5 100644 --- a/src/core.c +++ b/src/core.c @@ -5999,13 +5999,13 @@ static txnid_t find_oldest_reader(MDBX_env *env) { break; } } + } - if (new_oldest != prev_oldest) { - mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, - new_oldest); - mdbx_assert(env, new_oldest >= lck->mti_oldest_reader.weak); - atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); - } + if (new_oldest != prev_oldest) { + mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, + new_oldest); + mdbx_assert(env, new_oldest >= lck->mti_oldest_reader.weak); + atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); } return new_oldest; } From 08e936a8092385e0a73ba3fe6527eb0e4d12aa89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 11 Jul 2022 23:26:02 +0300 Subject: [PATCH 048/364] mdbx: re-verify atomic-ops and remove `mo_SequentialConsistency`. 
--- src/core.c | 11 +++++------ src/internals.h | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/core.c b/src/core.c index fb36cd5f..30123880 100644 --- a/src/core.c +++ b/src/core.c @@ -1099,16 +1099,16 @@ static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p, } else { #if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ - atomic_store64(p, UINT64_MAX, mo_SequentialConsistency); + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); #elif MDBX_64BIT_CAS /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ - atomic_store32(&p->high, UINT32_MAX, mo_SequentialConsistency); + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); #else /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1 * and overflow was preserved in safe64_txnid_next() */ STATIC_ASSERT(xMDBX_TXNID_STEP > 1); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - atomic_store32(&p->high, UINT32_MAX, mo_SequentialConsistency); + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; #endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */ } @@ -1794,8 +1794,7 @@ static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, << 24 | *abra >> 40; MDBX_lockinfo *const scan_lck = scan->lck; - atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, - mo_SequentialConsistency); + atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, mo_AcquireRelease); *abra = *abra * UINT64_C(6364136223846793005) + 1; return uniq_peek(pending, scan); } @@ -7787,7 +7786,7 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { * slot, next publish it in lck->mti_numreaders. After * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. 
*/ - atomic_store32(&result.rslot->mr_pid, 0, mo_SequentialConsistency); + atomic_store32(&result.rslot->mr_pid, 0, mo_AcquireRelease); safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) env->me_lck->mti_numreaders.weak = ++nreaders; diff --git a/src/internals.h b/src/internals.h index cf6a5f33..dff6b216 100644 --- a/src/internals.h +++ b/src/internals.h @@ -218,8 +218,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { From a82f59a998cd0650e8f6e485d244bdac4f2daf0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 11 Jul 2022 23:06:09 +0300 Subject: [PATCH 049/364] mdbx: minor refine `MDBX_UNALIGNED_OK`. --- src/options.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/options.h b/src/options.h index b4559f3a..4fb8b4fb 100644 --- a/src/options.h +++ b/src/options.h @@ -370,14 +370,11 @@ #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -386,6 +383,10 @@ #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 
0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif From cc51a7f76eb2e4bac1f0df0016d7fd8c2ac34e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 12 Jul 2022 16:33:27 +0300 Subject: [PATCH 050/364] mdbx: minor refine attributes-related macros for LCC. --- src/base.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/base.h b/src/base.h index e5297745..ebd54fc2 100644 --- a/src/base.h +++ b/src/base.h @@ -548,6 +548,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +#elif defined(__LCC__) +#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) #define __hot __attribute__((__hot__)) __optimize("O3") #else @@ -567,6 +569,8 @@ __extern_C key_t ftok(const char *, int); (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +#elif defined(__LCC__) +#define __hot __attribute__((__cold__, __optimize__("Osize"))) #elif defined(__GNUC__) || __has_attribute(cold) #define __cold __attribute__((__cold__)) __optimize("Os") #else From c4dd83fbdf6dfa5486362960cd7cfd8af6f4ded2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 23 Jul 2022 12:14:01 +0300 Subject: [PATCH 051/364] mdbx: minor refine `page_split()`. 
--- src/core.c | 107 +++++++++++++++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/src/core.c b/src/core.c index 30123880..28c6271c 100644 --- a/src/core.c +++ b/src/core.c @@ -3913,10 +3913,10 @@ static int __must_check_result mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); #define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ -static int __must_check_result mdbx_page_split(MDBX_cursor *mc, - const MDBX_val *const newkey, - MDBX_val *const newdata, - pgno_t newpgno, unsigned nflags); +static int __must_check_result page_split(MDBX_cursor *mc, + const MDBX_val *const newkey, + MDBX_val *const newdata, + pgno_t newpgno, const unsigned naf); static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, bool report); @@ -16299,23 +16299,22 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, rdata = data; new_sub:; - unsigned nflags = flags & NODE_ADD_FLAGS; + const unsigned naf = flags & NODE_ADD_FLAGS; size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len : leaf_size(env, key, rdata); if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { - if (!insert_key) - nflags |= MDBX_SPLIT_REPLACE; - rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); + rc = page_split(mc, key, rdata, P_INVALID, + insert_key ? naf : naf | MDBX_SPLIT_REPLACE); if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) rc = insert_key ? mdbx_cursor_check(mc) : mdbx_cursor_check_updating(mc); } else { /* There is room already in this leaf page. 
*/ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0 && + mdbx_cassert(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && rdata->iov_len == 0); rc = mdbx_node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); } else - rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, nflags); + rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; @@ -17434,10 +17433,10 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { mdbx_debug("Not enough room, delta = %zd, splitting...", delta); pgno_t pgno = node_pgno(node); mdbx_node_del(mc, 0); - int rc = mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check_updating(mc); - return rc; + int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); + if (err == MDBX_SUCCESS && mdbx_audit_enabled()) + err = mdbx_cursor_check_updating(mc); + return err; } nkeys = page_numkeys(mp); @@ -18950,11 +18949,11 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * [in] newkey The key for the newly inserted node. * [in] newdata The data for the newly inserted node. * [in] newpgno The page number, if the new node is a branch node. - * [in] nflags The NODE_ADD_FLAGS for the new node. + * [in] naf The NODE_ADD_FLAGS for the new node. * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, - MDBX_val *const newdata, pgno_t newpgno, - unsigned nflags) { +static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, + MDBX_val *const newdata, pgno_t newpgno, + const unsigned naf) { unsigned flags; int rc = MDBX_SUCCESS, foliage = 0; unsigned i, ptop; @@ -19046,7 +19045,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, unsigned split_indx = (newindx < nkeys) - ? 
/* split at the middle */ (nkeys + 1) / 2 + ? /* split at the middle */ (nkeys + 1) >> 1 : /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1; mdbx_assert(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); @@ -19054,7 +19053,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; - if (newindx == 0 && foliage == 0 && !(nflags & MDBX_SPLIT_REPLACE)) { + if (newindx == 0 && foliage == 0 && !(naf & MDBX_SPLIT_REPLACE)) { split_indx = 0; /* Checking for ability of splitting by the left-side insertion * of a pure page with the new key */ @@ -19098,7 +19097,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, char *split, *ins; unsigned lsize, rsize, ksize; /* Move half of the keys to the right sibling */ - const int x = mc->mc_ki[mc->mc_top] - split_indx; + const int distance = mc->mc_ki[mc->mc_top] - split_indx; ksize = mc->mc_db->md_xsize; split = page_leaf2key(mp, split_indx, ksize); rsize = (nkeys - split_indx) * ksize; @@ -19113,7 +19112,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sister->mp_upper -= (indx_t)(rsize - lsize); sepkey.iov_len = ksize; sepkey.iov_base = (newindx != split_indx) ? 
split : newkey->iov_base; - if (x < 0) { + if (distance < 0) { mdbx_cassert(mc, ksize >= sizeof(indx_t)); ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(sister->mp_ptrs, split, rsize); @@ -19125,16 +19124,16 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); } else { - memcpy(sister->mp_ptrs, split, x * ksize); - ins = page_leaf2key(sister, x, ksize); + memcpy(sister->mp_ptrs, split, distance * ksize); + ins = page_leaf2key(sister, distance, ksize); memcpy(ins, newkey->iov_base, ksize); - memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); + memcpy(ins + ksize, split + distance * ksize, rsize - distance * ksize); mdbx_cassert(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); sister->mp_lower += sizeof(indx_t); mdbx_cassert(mc, sister->mp_upper >= ksize - sizeof(indx_t)); sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - mdbx_cassert(mc, x <= (int)UINT16_MAX); - mc->mc_ki[mc->mc_top] = (indx_t)x; + mdbx_cassert(mc, distance <= (int)UINT16_MAX); + mc->mc_ki[mc->mc_top] = (indx_t)distance; } if (mdbx_audit_enabled()) { @@ -19158,11 +19157,11 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, : branch_size(env, newkey); /* prepare to insert */ - for (unsigned j = i = 0; i < nkeys; ++i, ++j) { - tmp_ki_copy->mp_ptrs[j] = 0; - j += (i == newindx); - tmp_ki_copy->mp_ptrs[j] = mp->mp_ptrs[i]; - } + for (i = 0; i < newindx; ++i) + tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i]; + tmp_ki_copy->mp_ptrs[i] = (indx_t)-1; + while (++i <= nkeys) + tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i - 1]; tmp_ki_copy->mp_pgno = mp->mp_pgno; tmp_ki_copy->mp_flags = mp->mp_flags; tmp_ki_copy->mp_txnid = INVALID_TXNID; @@ -19184,20 +19183,22 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, * будет в каждом ключе, в худшем случае кроме одного, который может быть * нулевого размера. 
*/ - if (newindx == split_indx && split_indx + minkeys <= nkeys) - split_indx += 1; + if (newindx == split_indx && nkeys >= 5) { + STATIC_ASSERT(P_BRANCH == 1); + split_indx += mp->mp_flags & P_BRANCH; + } mdbx_assert(env, - split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); + split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); const unsigned dim_nodes = (newindx >= split_indx) ? split_indx : nkeys - split_indx; const unsigned dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; if (new_size >= dim_used) { - /* Find split point */ + /* Search for best acceptable split point */ i = (newindx < split_indx) ? 0 : nkeys; int dir = (newindx < split_indx) ? 1 : -1; size_t before = 0, after = new_size + page_used(env, mp); unsigned best_split = split_indx; - unsigned best_offset = INT_MAX; + unsigned best_shift = INT_MAX; mdbx_trace("seek separator from %u, step %i, default %u, new-idx %u, " "new-size %zu", @@ -19223,11 +19224,13 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (before <= max_space && after <= max_space) { const unsigned split = i + (dir > 0); if (split >= minkeys && split <= nkeys + 1 - minkeys) { - const unsigned offset = branchless_abs(split_indx - split); - if (offset >= best_offset) + const unsigned shift = branchless_abs(split_indx - split); + if (shift >= best_shift) break; - best_offset = offset; + best_shift = shift; best_split = split; + if (!best_shift) + break; } } i += dir; @@ -19237,10 +19240,9 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mdbx_trace("chosen %u", split_indx); } mdbx_assert(env, - split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); + split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); - sepkey.iov_len = newkey->iov_len; - sepkey.iov_base = newkey->iov_base; + sepkey = *newkey; if (split_indx != newindx) { MDBX_node *node = (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] + @@ -19265,7 +19267,7 @@ static int 
mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, did_split_parent = true; /* We want other splits to find mn when doing fixups */ WITH_CURSOR_TRACKING( - mn, rc = mdbx_page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); + mn, rc = page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); if (unlikely(rc != MDBX_SUCCESS)) goto done; mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); @@ -19349,10 +19351,10 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, switch (PAGETYPE_WHOLE(sister)) { case P_LEAF: { mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); - rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags); + rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, naf); } break; case P_LEAF | P_LEAF2: { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + mdbx_cassert(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); rc = mdbx_node_add_leaf2(mc, 0, newkey); } break; @@ -19389,19 +19391,18 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_pg[mc->mc_top] = sister; i = split_indx; unsigned n = 0; - pgno_t pgno = 0; do { mdbx_trace("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, sister->mp_pgno); + pgno_t pgno = 0; MDBX_val *rdata = NULL; if (i == newindx) { - rkey.iov_base = newkey->iov_base; - rkey.iov_len = newkey->iov_len; + rkey = *newkey; if (IS_LEAF(mp)) rdata = newdata; else pgno = newpgno; - flags = nflags; + flags = naf; /* Update index for the new key. */ mc->mc_ki[mc->mc_top] = (indx_t)n; } else { @@ -19506,14 +19507,14 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, m3->mc_ki[k + 1] = m3->mc_ki[k]; m3->mc_pg[k + 1] = m3->mc_pg[k]; } - m3->mc_ki[0] = (m3->mc_ki[0] >= nkeys) ? 
1 : 0; + m3->mc_ki[0] = m3->mc_ki[0] >= nkeys; m3->mc_pg[0] = mc->mc_pg[0]; m3->mc_snum++; m3->mc_top++; } if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE)) + if (m3->mc_ki[mc->mc_top] >= newindx && !(naf & MDBX_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = sister; @@ -19544,7 +19545,7 @@ done: else { if (mdbx_audit_enabled()) rc = mdbx_cursor_check_updating(mc); - if (unlikely(nflags & MDBX_RESERVE)) { + if (unlikely(naf & MDBX_RESERVE)) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node_flags(node) & F_BIGDATA)) newdata->iov_base = node_data(node); From 289636834cca925f9c669a94f2f950a2569fb49e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 23 Jul 2022 18:56:12 +0300 Subject: [PATCH 052/364] mdbx: fix `unused` warning for case `MDBX_DISABLE_VALIDATION`. --- src/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 28c6271c..464f535d 100644 --- a/src/core.c +++ b/src/core.c @@ -14181,7 +14181,9 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, } else { assert(false); } -#endif /* !MDBX_DISABLE_VALIDATION */ +#else + (void)ILL; +#endif /* MDBX_DISABLE_VALIDATION */ if (unlikely(mc->mc_checking & CC_PAGECHECK) && unlikely(MDBX_SUCCESS != (r.err = mdbx_page_check(mc, r.page)))) From 262fafd00efc9c742f1c3d72328437fe4077cc06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 23 Jul 2022 19:24:56 +0300 Subject: [PATCH 053/364] mdbx: fix `unused` warning for case `MDBX_ENABLE_PGOP_STAT=0`. 
--- src/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 464f535d..9793580d 100644 --- a/src/core.c +++ b/src/core.c @@ -7417,7 +7417,9 @@ retry:; if (!inside_txn) { if (!locked) { int err; +#if MDBX_ENABLE_PGOP_STAT unsigned wops = 0; +#endif /* MDBX_ENABLE_PGOP_STAT */ /* pre-sync to avoid latency for writer */ if (unsynced_pages > /* FIXME: define threshold */ 16 && (flags & MDBX_SAFE_NOSYNC) == 0) { @@ -7446,8 +7448,10 @@ retry:; if (unlikely(err != MDBX_SUCCESS)) return err; - /* pre-sync done */ +#if MDBX_ENABLE_PGOP_STAT wops = 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + /* pre-sync done */ rc = MDBX_SUCCESS /* means "some data was synced" */; } From fe6c6b2068f4ed9addd5b4a6062921d1a889fd71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 24 Jul 2022 17:31:38 +0300 Subject: [PATCH 054/364] mdbx: add `MDBX_HAVE_CMOV` macro/option. 
--- src/base.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/base.h b/src/base.h index ebd54fc2..08c2492c 100644 --- a/src/base.h +++ b/src/base.h @@ -375,6 +375,33 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ From 47d5fa7fd475b45408aa72489405620b2d7fd3ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 24 Jul 2022 17:54:40 +0300 Subject: [PATCH 055/364] mdbx: refine/speedup `pnl_merge()`. 
--- src/core.c | 62 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/src/core.c b/src/core.c index 9793580d..65f1369c 100644 --- a/src/core.c +++ b/src/core.c @@ -3215,25 +3215,49 @@ static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, return mdbx_pnl_check(pl, limit); } -/* Merge an PNL onto an PNL. The destination PNL must be big enough */ -static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) { +static __always_inline void +pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, + const pgno_t *__restrict src_b, + const pgno_t *__restrict const src_b_detent) { + do { +#if MDBX_HAVE_CMOV + const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) + // lcc 1.26: 13ШК (подготовка и первая итерация) + 7ШК (цикл), БЕЗ loop-mode + // gcc>=7: cmp+jmp с возвратом в тело цикла (WTF?) + // gcc<=6: cmov×3 + // clang<=12: cmov×3 + // clang>=13: cmov, set+add/sub + *dst = flag ? *src_a-- : *src_b--; +#else + // gcc: cmov, cmp+set+add/sub + // clang<=5: cmov×2, set+add/sub + // clang>=6: cmov, set+add/sub + *dst = flag ? *src_a : *src_b; + src_b += flag - 1; + src_a -= flag; +#endif + --dst; +#else /* MDBX_HAVE_CMOV */ + while (MDBX_PNL_ORDERED(*src_b, *src_a)) + *dst-- = *src_a--; + *dst-- = *src_b--; +#endif /* !MDBX_HAVE_CMOV */ + } while (likely(src_b > src_b_detent)); +} + +/* Merge a PNL onto a PNL. 
The destination PNL must be big enough */ +static void __hot pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); assert(mdbx_pnl_check(src, MAX_PAGENO + 1)); - if (likely(MDBX_PNL_SIZE(src) > 0)) { - const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src); + const pgno_t src_len = MDBX_PNL_SIZE(src); + const pgno_t dst_len = MDBX_PNL_SIZE(dst); + if (likely(src_len > 0)) { + const pgno_t total = dst_len + src_len; assert(MDBX_PNL_ALLOCLEN(dst) >= total); - pgno_t *w = dst + total; - pgno_t *d = dst + MDBX_PNL_SIZE(dst); - const pgno_t *s = src + MDBX_PNL_SIZE(src); - dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0); - do { - const bool cmp = MDBX_PNL_ORDERED(*s, *d); - *w = cmp ? *d : *s; - d -= cmp ? 1 : 0; - s -= cmp ? 0 : 1; - --w; - } while (s > src); - MDBX_PNL_SIZE(dst) = (pgno_t)total; + dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); + pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + MDBX_PNL_SIZE(dst) = total; } assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); } @@ -6835,7 +6859,7 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, /* Merge in descending sorted order */ const unsigned prev_re_len = MDBX_PNL_SIZE(re_list); - mdbx_pnl_xmerge(re_list, gc_pnl); + pnl_merge(re_list, gc_pnl); if (mdbx_audit_enabled() && unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; @@ -9574,7 +9598,7 @@ retry: mdbx_tassert(txn, count == txn->tw.loose_count); MDBX_PNL_SIZE(loose) = count; mdbx_pnl_sort(loose, txn->mt_next_pgno); - mdbx_pnl_xmerge(txn->tw.reclaimed_pglist, loose); + pnl_merge(txn->tw.reclaimed_pglist, loose); mdbx_trace("%s: append %u loose-pages to reclaimed-pages", dbg_prefix_mode, txn->tw.loose_count); } @@ -10586,7 +10610,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, if (txn->tw.spill_pages) { if (parent->tw.spill_pages) { /* Must not fail since 
space was preserved above. */ - mdbx_pnl_xmerge(parent->tw.spill_pages, txn->tw.spill_pages); + pnl_merge(parent->tw.spill_pages, txn->tw.spill_pages); mdbx_pnl_free(txn->tw.spill_pages); } else { parent->tw.spill_pages = txn->tw.spill_pages; From dc39ecfb9fc153b33063c3de1009aad3757f471c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 24 Jul 2022 21:20:22 +0300 Subject: [PATCH 056/364] mdbx: auto-coalesce of GC's records with less overhead. --- src/core.c | 20 +++++++------------- src/internals.h | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/core.c b/src/core.c index 65f1369c..991141a0 100644 --- a/src/core.c +++ b/src/core.c @@ -6645,19 +6645,18 @@ __hot static pgno_t *scan4range(const MDBX_PNL pnl, const unsigned len, #define MDBX_ALLOC_NOLOG 32 #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -__cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, - int flags) { +static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; mdbx_assert(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); mdbx_assert(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); - const unsigned coalesce_threshold = - env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4; + const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & MDBX_LIFORECLAIM; - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) flags |= MDBX_ALLOC_COALESCE; if (unlikely( /* If mc is updating the GC, then the retired-list cannot play @@ -6858,7 +6857,6 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, } /* Merge in descending sorted order */ - const unsigned 
prev_re_len = MDBX_PNL_SIZE(re_list); pnl_merge(re_list, gc_pnl); if (mdbx_audit_enabled() && unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { @@ -6891,15 +6889,11 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, } /* Don't try to coalesce too much. */ - if (flags & MDBX_ALLOC_COALESCE) { - if (re_len /* current size */ > coalesce_threshold || - (re_len > prev_re_len && - re_len - prev_re_len /* delta from prev */ >= - coalesce_threshold / 2)) { + if (re_len /* current size */ > coalesce_threshold) { + if (flags & MDBX_ALLOC_COALESCE) mdbx_trace("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - flags &= ~MDBX_ALLOC_COALESCE; - } + flags &= ~MDBX_ALLOC_COALESCE; } } diff --git a/src/internals.h b/src/internals.h index dff6b216..cc2a578a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1109,7 +1109,7 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) - /* Legacy MDBX_MAPASYNC (prior v0.12) */ + /* Legacy MDBX_COALESCE (prior v0.12) */ #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; From 75d19b58068c4a1cea81c9e913c7599ec079d7b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 24 Jul 2022 22:06:47 +0300 Subject: [PATCH 057/364] mdbx: minor refine/speedup `pnl_check()`. 
--- src/core.c | 137 ++++++++++++++++++++++++++--------------------------- 1 file changed, 67 insertions(+), 70 deletions(-) diff --git a/src/core.c b/src/core.c index 991141a0..064420a0 100644 --- a/src/core.c +++ b/src/core.c @@ -3182,37 +3182,35 @@ static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, return MDBX_SUCCESS; } -static bool mdbx_pnl_check(const MDBX_PNL pl, const size_t limit) { +__hot static bool pnl_check(const pgno_t *pl, const size_t limit) { assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND); if (likely(MDBX_PNL_SIZE(pl))) { - assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO); - assert(MDBX_PNL_MOST(pl) < limit); - assert(MDBX_PNL_SIZE(pl) <= MDBX_PGL_LIMIT); if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT)) return false; if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) return false; if (unlikely(MDBX_PNL_MOST(pl) >= limit)) return false; - if (!MDBX_DISABLE_VALIDATION || mdbx_audit_enabled()) { - for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) { - assert(MDBX_PNL_ORDERED(scan[0], scan[1])); - if (unlikely(!MDBX_PNL_ORDERED(scan[0], scan[1]))) + + if ((!MDBX_DISABLE_VALIDATION || mdbx_audit_enabled()) && + likely(MDBX_PNL_SIZE(pl) > 1)) { + const pgno_t *scan = MDBX_PNL_BEGIN(pl); + const pgno_t *const end = MDBX_PNL_END(pl); + pgno_t prev = *scan++; + do { + if (unlikely(!MDBX_PNL_ORDERED(prev, *scan))) return false; - } + prev = *scan; + } while (likely(++scan != end)); } } return true; } -static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, - const size_t limit) { - if (unlikely(pl == nullptr)) - return true; - assert(MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl)); - if (unlikely(MDBX_PNL_ALLOCLEN(pl) < MDBX_PNL_SIZE(pl))) - return false; - return mdbx_pnl_check(pl, limit); +static __always_inline bool pnl_check_allocated(const pgno_t *pl, + const size_t limit) { + return pl == nullptr || + (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl) && pnl_check(pl, limit)); } static __always_inline void @@ -3248,8 +3246,8 @@ 
pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, /* Merge a PNL onto a PNL. The destination PNL must be big enough */ static void __hot pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { - assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); - assert(mdbx_pnl_check(src, MAX_PAGENO + 1)); + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + assert(pnl_check(src, MAX_PAGENO + 1)); const pgno_t src_len = MDBX_PNL_SIZE(src); const pgno_t dst_len = MDBX_PNL_SIZE(dst); if (likely(src_len > 0)) { @@ -3259,7 +3257,7 @@ static void __hot pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); MDBX_PNL_SIZE(dst) = total; } - assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); } static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { @@ -3329,7 +3327,7 @@ static __hot void mdbx_pnl_sort_nochk(MDBX_PNL pnl) { static __inline void mdbx_pnl_sort(MDBX_PNL pnl, size_t limit4check) { mdbx_pnl_sort_nochk(pnl); - assert(mdbx_pnl_check(pnl, limit4check)); + assert(pnl_check(pnl, limit4check)); (void)limit4check; } @@ -3351,7 +3349,7 @@ static __hot unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, pgno_t pgno) { static __inline unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno, size_t limit) { - assert(mdbx_pnl_check4assert(pnl, limit)); + assert(pnl_check_allocated(pnl, limit)); assert(pgno < limit); (void)limit; return mdbx_pnl_search_nochk(pnl, pgno); @@ -4620,8 +4618,8 @@ static void mdbx_refund_reclaimed(MDBX_txn *txn) { mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); txn->mt_next_pgno = next_pgno; - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - 1)); + mdbx_tassert(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - 1)); } static void mdbx_refund_loose(MDBX_txn *txn) { @@ -5102,8 
+5100,8 @@ status_done: mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); return rc; } @@ -6671,9 +6669,8 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); } - mdbx_assert(env, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_assert(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; unsigned re_len = MDBX_PNL_SIZE(re_list); pgno_t *range = nullptr; @@ -6690,8 +6687,8 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Seek a big enough contiguous page range. * Prefer pages with lower pgno. 
*/ - mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_assert(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { mdbx_assert(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); @@ -6809,7 +6806,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || - !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) { + !pnl_check(gc_pnl, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; } @@ -6859,7 +6856,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Merge in descending sorted order */ pnl_merge(re_list, gc_pnl); if (mdbx_audit_enabled() && - unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { + unlikely(!pnl_check(re_list, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; } @@ -7022,8 +7019,8 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { mdbx_osal_monotime() - timestamp; #endif /* MDBX_ENABLE_PGOP_STAT */ mdbx_assert(env, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); int level; const char *what; if (likely(!(flags & MDBX_ALLOC_FAKE))) { @@ -7086,8 +7083,8 @@ done: #endif MDBX_PNL_SIZE(re_list) = re_len -= num; mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { txn->mt_next_pgno = pgno + num; mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); @@ -7109,8 +7106,8 @@ done: goto fail; mdbx_tassert(txn, - 
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); return ret; } @@ -7174,9 +7171,9 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { mdbx_tassert(txn, ret.page->mp_pgno >= NUM_METAS); ret.err = mdbx_page_dirty(txn, ret.page, 1); - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_tassert(txn, + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); return ret; } } @@ -8514,7 +8511,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); - mdbx_assert(env, mdbx_pnl_check4assert( + mdbx_assert(env, pnl_check_allocated( txn->tw.reclaimed_pglist, (txn->mt_next_pgno /* LY: intentional assignment here, only for assertion */ @@ -8864,8 +8861,8 @@ static void dbi_update(MDBX_txn *txn, int keep) { static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { - mdbx_tassert( - txn, mdbx_pnl_check4assert(pl, (size_t)txn->mt_next_pgno << spilled)); + mdbx_tassert(txn, + pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); MDBX_dpl *dl = mdbx_dpl_sort(txn); /* Scanning in ascend order */ @@ -9017,9 +9014,9 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); mdbx_assert(env, parent->mt_child == txn && (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert( - env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_assert(env, + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (txn->tw.lifo_reclaimed) { 
mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= @@ -9413,8 +9410,8 @@ retry: mdbx_trace("%s", " >> restart"); int rc = MDBX_SUCCESS; mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { mdbx_error("too more loops %u, bailout", ctx->loop); @@ -9448,8 +9445,8 @@ retry: } mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (ctx->lifo) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) @@ -9525,8 +9522,8 @@ retry: } mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); if (mdbx_audit_enabled()) { rc = mdbx_audit_ex(txn, ctx->retired_stored, false); @@ -9536,9 +9533,9 @@ retry: /* return suitable into unallocated space */ if (mdbx_refund(txn)) { - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_tassert(txn, + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (mdbx_audit_enabled()) { rc = mdbx_audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) @@ -9743,8 +9740,8 @@ retry: /* handle reclaimed and lost pages - merge and store both into gc */ mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, 
txn->tw.loose_count == 0); mdbx_trace("%s", " >> reserving"); @@ -10019,8 +10016,8 @@ retry: rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -10054,8 +10051,8 @@ retry: : ctx->reused_slot; rc = MDBX_SUCCESS; mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { MDBX_val key, data; @@ -10389,8 +10386,8 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); #endif } - mdbx_tassert( - txn, mdbx_pnl_check4assert(sl, (size_t)parent->mt_next_pgno << 1)); + mdbx_tassert(txn, + pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); /* Remove reclaimed pages from parent's spill list */ s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); @@ -10451,8 +10448,8 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, /* Remove anything in our spill list from parent's dirty list */ if (txn->tw.spill_pages) { - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.spill_pages, - (size_t)parent->mt_next_pgno << 1)); + mdbx_tassert(txn, pnl_check_allocated(txn->tw.spill_pages, + (size_t)parent->mt_next_pgno << 1)); mdbx_dpl_sift(parent, txn->tw.spill_pages, true); mdbx_tassert(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == @@ -10615,8 +10612,8 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; if (parent->tw.spill_pages) { - 
assert(mdbx_pnl_check4assert(parent->tw.spill_pages, - (size_t)parent->mt_next_pgno << 1)); + assert(pnl_check_allocated(parent->tw.spill_pages, + (size_t)parent->mt_next_pgno << 1)); if (MDBX_PNL_SIZE(parent->tw.spill_pages)) parent->mt_flags |= MDBX_TXN_SPILLS; } @@ -20042,7 +20039,7 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, const MDBX_PNL pnl = data.iov_base; if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(pnl) || - !(mdbx_pnl_check(pnl, read_txn->mt_next_pgno)))) + !(pnl_check(pnl, read_txn->mt_next_pgno)))) return MDBX_CORRUPTED; gc += MDBX_PNL_SIZE(pnl); } From 268b33cbf7fdf0e12de0da8b1a3b761233f59aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 25 Jul 2022 12:53:58 +0300 Subject: [PATCH 058/364] mdbx: simplify/speedup `scan4seq()`. --- src/core.c | 132 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 50 deletions(-) diff --git a/src/core.c b/src/core.c index 064420a0..5b9670a8 100644 --- a/src/core.c +++ b/src/core.c @@ -6562,59 +6562,86 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { return MDBX_SUCCESS; } -__hot static pgno_t *scan4range(const MDBX_PNL pnl, const unsigned len, - const int num) { - assert(num > 0 && len >= (unsigned)num && len == MDBX_PNL_SIZE(pnl)); +__hot static pgno_t *scan4seq(pgno_t *range, const size_t len, + const unsigned seq) { + assert(seq > 0 && len > seq); #if MDBX_PNL_ASCENDING - const pgno_t *const detent = pnl + len - num; - pgno_t *scan = pnl + 1; - while (likely(scan + 7 <= detent)) { - if (unlikely(scan[num] == *scan + num)) - return scan; - if (unlikely(scan[num + 1] == scan[1] + num)) - return scan + 1; - if (unlikely(scan[num + 2] == scan[2] + num)) - return scan + 2; - if (unlikely(scan[num + 3] == scan[3] + num)) - return scan + 3; - if (unlikely(scan[num + 4] == scan[4] + 
num)) - return scan + 4; - if (unlikely(scan[num + 5] == scan[5] + num)) - return scan + 5; - if (unlikely(scan[num + 6] == scan[6] + num)) - return scan + 6; - if (unlikely(scan[num + 7] == scan[7] + num)) - return scan + 7; - scan += 8; + assert(range[-1] == len); + const pgno_t *const detent = range + len - seq; + const ptrdiff_t offset = (ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[offset + 0] - range[0]; + const pgno_t diff1 = range[offset + 1] - range[1]; + const pgno_t diff2 = range[offset + 2] - range[2]; + const pgno_t diff3 = range[offset + 3] - range[3]; + if (diff0 == target) + return range + 0; + if (diff1 == target) + return range + 1; + if (diff2 == target) + return range + 2; + if (diff3 == target) + return range + 3; + range += 4; + } while (range + 3 < detent); + if (range == detent) + return nullptr; } - for (; scan <= detent; ++scan) - if (scan[num] == *scan + num) - return scan; + do + if (range[offset] - *range == target) + return range; + while (++range < detent); #else - const pgno_t *const detent = pnl + num; - pgno_t *scan = pnl + len; - while (likely(scan - 7 >= detent)) { - if (unlikely(scan[-num] == *scan + num)) - return scan; - if (unlikely(scan[-num - 1] == scan[-1] + num)) - return scan - 1; - if (unlikely(scan[-num - 2] == scan[-2] + num)) - return scan - 2; - if (unlikely(scan[-num - 3] == scan[-3] + num)) - return scan - 3; - if (unlikely(scan[-num - 4] == scan[-4] + num)) - return scan - 4; - if (unlikely(scan[-num - 5] == scan[-5] + num)) - return scan - 5; - if (unlikely(scan[-num - 6] == scan[-6] + num)) - return scan - 6; - if (unlikely(scan[-num - 7] == scan[-7] + num)) - return scan - 7; - scan -= 8; + assert(range[-len] == len); + const pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[-0] - range[offset 
- 0]; + const pgno_t diff1 = range[-1] - range[offset - 1]; + const pgno_t diff2 = range[-2] - range[offset - 2]; + const pgno_t diff3 = range[-3] - range[offset - 3]; + /* Смысл вычислений до ветвлений в том, чтобы позволить компилятору + * загружать и вычислять все значения параллельно. */ + if (diff0 == target) + return range - 0; + if (diff1 == target) + return range - 1; + if (diff2 == target) + return range - 2; + if (diff3 == target) + return range - 3; + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + do + if (*range - range[offset] == target) + return range; + while (--range > detent); +#endif /* MDBX_PNL sort-order */ + return nullptr; +} + +MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, + const unsigned seq) { + size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pnl); +#if MDBX_PNL_ASCENDING + while (seq <= MDBX_PNL_SIZE(pnl) - begin) { + if (pnl[begin + seq] - pnl[begin] == seq) + return pnl + begin; + ++begin; + } +#else + while (begin > seq) { + if (pnl[begin - seq] - pnl[begin] == seq) + return pnl + begin; + --begin; } - for (; scan >= detent; --scan) - if (scan[-num] == *scan + num) - return scan; #endif /* MDBX_PNL sort-order */ return nullptr; } @@ -6692,7 +6719,12 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { mdbx_assert(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); - range = scan4range(re_list, re_len, num); + range = re_list + (MDBX_PNL_ASCENDING ? 
1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + mdbx_tassert(txn, range == scan4range_checker(re_list, num - 1)); if (likely(range)) { pgno = *range; goto done; From bfac10418fcb58831277af22f17356ccbb09ea32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 25 Jul 2022 20:17:16 +0300 Subject: [PATCH 059/364] mdbx-make: preserve `MDBX_BUILD_OPTION` for assertion-targets. --- GNUmakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 59d7b8d9..f9f9707c 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -382,11 +382,11 @@ MDBX_SMOKE_EXTRA ?= check: DESTDIR = $(shell pwd)/@check-install check: test dist install -smoke-assertion: MDBX_BUILD_OPTIONS=-DMDBX_FORCE_ASSERTIONS=1 +smoke-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1) smoke-assertion: smoke -test-assertion: MDBX_BUILD_OPTIONS=-DMDBX_FORCE_ASSERTIONS=1 +test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1) test-assertion: smoke -long-test-assertion: MDBX_BUILD_OPTIONS=-DMDBX_FORCE_ASSERTIONS=1 +long-test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1) long-test-assertion: smoke smoke: build-test From 9eb69537784343b03b2a406d19a73a7ed62bb54e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 26 Jul 2022 14:27:57 +0300 Subject: [PATCH 060/364] mdbx: fix minor typo. --- src/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core.c b/src/core.c index 5b9670a8..1c91ec35 100644 --- a/src/core.c +++ b/src/core.c @@ -5828,9 +5828,9 @@ static __inline uint64_t meta_sign(const MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? 
sign : ~sign; } -enum meta_choise_mode { prefer_last, prefer_steady }; +enum meta_choice_mode { prefer_last, prefer_steady }; -static __inline bool meta_ot(const enum meta_choise_mode mode, +static __inline bool meta_ot(const enum meta_choice_mode mode, const MDBX_env *env, volatile const MDBX_meta *a, volatile const MDBX_meta *b) { mdbx_jitter4testing(true); @@ -5882,7 +5882,7 @@ static int meta_eq_mask(const MDBX_env *env) { } static __always_inline volatile const MDBX_meta * -meta_recent(const enum meta_choise_mode mode, const MDBX_env *env, +meta_recent(const enum meta_choice_mode mode, const MDBX_env *env, volatile const MDBX_meta *a, volatile const MDBX_meta *b) { const bool a_older_that_b = meta_ot(mode, env, a, b); mdbx_assert(env, !meta_eq(env, a, b)); @@ -5898,7 +5898,7 @@ static const MDBX_meta *meta_ancient_prefer_weak(const MDBX_env *env, } static __always_inline volatile const MDBX_meta * -meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { +meta_mostrecent(const enum meta_choice_mode mode, const MDBX_env *env) { volatile const MDBX_meta *m0 = METAPAGE(env, 0); volatile const MDBX_meta *m1 = METAPAGE(env, 1); volatile const MDBX_meta *m2 = METAPAGE(env, 2); From c37fb505326dbe4120745d798a39f05041791c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 26 Jul 2022 14:29:22 +0300 Subject: [PATCH 061/364] mdbx: more for `__amd64__` macro. 
--- src/base.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/base.h b/src/base.h index 08c2492c..14ec5e78 100644 --- a/src/base.h +++ b/src/base.h @@ -306,8 +306,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ From 480dc2531e300662ae0515c6b610edcd0f6143e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 26 Jul 2022 14:38:03 +0300 Subject: [PATCH 062/364] =?UTF-8?q?mdbx:=20=C3=974=20accelerated=20`scan4s?= =?UTF-8?q?eq()`=20(SSE2=20only=20for=20now).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/base.h | 6 +- src/core.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 181 insertions(+), 5 deletions(-) diff --git a/src/base.h b/src/base.h index 14ec5e78..a40b34fd 100644 --- a/src/base.h +++ b/src/base.h @@ -409,11 +409,13 @@ __extern_C key_t ftok(const char *, int); #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include diff --git a/src/core.c b/src/core.c index 1c91ec35..37bf0048 100644 --- a/src/core.c +++ b/src/core.c @@ -6562,8 +6562,10 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const 
txnid_t last_steady) { return MDBX_SUCCESS; } -__hot static pgno_t *scan4seq(pgno_t *range, const size_t len, - const unsigned seq) { +//------------------------------------------------------------------------------ + +MDBX_MAYBE_UNUSED __hot static pgno_t * +scan4seq_fallback(pgno_t *range, const size_t len, const unsigned seq) { assert(seq > 0 && len > seq); #if MDBX_PNL_ASCENDING assert(range[-1] == len); @@ -6594,7 +6596,7 @@ __hot static pgno_t *scan4seq(pgno_t *range, const size_t len, return range; while (++range < detent); #else - assert(range[-len] == len); + assert(range[-(ptrdiff_t)len] == len); const pgno_t *const detent = range - len + seq; const ptrdiff_t offset = -(ptrdiff_t)seq; const pgno_t target = (pgno_t)offset; @@ -6646,6 +6648,178 @@ MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, return nullptr; } +#if !defined(MDBX_ATTRIBUTE_TARGET) && \ + (__has_attribute(__target__) || __GNUC_PREREQ(4, 8)) +#define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) +#endif /* MDBX_ATTRIBUTE_TARGET */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW +MDBX_MAYBE_UNUSED +__hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t *static pgno_t * +scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 +MDBX_MAYBE_UNUSED +__hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t *static pgno_t * +scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX static pgno_t *static pgno_t * +scan4seq_avx(pgno_t *range, const size_t len, const unsigned seq) { + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX */ + +#if defined(__SSE2__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) +#define __SSE2__ 
+#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2") +#endif + +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 +MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned +diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m128i pattern) { + const __m128i f = _mm_loadu_si128((const __m128i *)ptr); + const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); + const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); + return _mm_movemask_ps(*(const __m128 *)&cmp); +} + +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * +scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m128i pattern = _mm_set_epi32(target, target, target, target); + uint8_t mask; + if (likely(len > seq + 3)) { + do { + mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) + goto found; + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifdef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0; +#else + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; +#endif + if (likely(on_page_safe_mask & (uintptr_t)range)) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = 0xF << extra; + mask &= diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) { + found:; +#ifdef _MSC_VER + unsigned long index; + _BitScanReverse(&index, mask); +#else + const unsigned index = __builtin_clz(mask); +#endif /* _MSC_VER */ + range = range + 28 - index; + return range; + } + return nullptr; + } + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ + +#if defined(__AVX512BW__) +#define scan4seq_default scan4seq_avx512bw +#define scan4seq scan4seq_default +#elif defined(__AVX2__) +#define scan4seq_default scan4seq_avx2 +#elif defined(__AVX__) +#define scan4seq_default scan4seq_avx +#elif defined(__SSE2__) +#define scan4seq_default scan4seq_sse2 +/* Choosing of another variants should be added here. */ +#endif /* scan4seq_default */ + +#ifndef scan4seq_default +#define scan4seq_default scan4seq_fallback +#endif /* scan4seq_default */ + +#ifdef scan4seq +/* The scan4seq() is the best or no alternatives */ +#else +#if !(__has_builtin(__builtin_cpu_supports) || \ + defined(__BUILTIN_CPU_SUPPORTS__) || \ + (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))) +/* The scan4seq_default() will be used since no cpu-features detection support + * from compiler. Please don't ask to implement cpuid-based detection and don't + * make such PRs. */ +#define scan4seq scan4seq_default +#else +/* Selecting the most appropriate implementation at runtime, + * depending on the available CPU features. 
*/ +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const unsigned seq); +static pgno_t *(*scan4seq)(pgno_t *range, const size_t len, + const unsigned seq) = scan4seq_resolver; + +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const unsigned seq) { + pgno_t *(*choice)(pgno_t * range, const size_t len, const unsigned seq) = + nullptr; +#if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \ + __GNUC_PREREQ(4, 8) + __builtin_cpu_init(); +#endif /* __builtin_cpu_init() */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW + if (__builtin_cpu_supports("avx512bw")) + choice = scan4seq_avx512; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 + if (__builtin_cpu_supports("avx2")) + choice = scan4seq_avx2; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX + if (__builtin_cpu_supports("avx")) + choice = scan4seq_avx; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 + if (!choice && __builtin_cpu_supports("sse2")) + choice = scan4seq_sse2; +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ + /* Choosing of another variants should be added here. */ + scan4seq = choice ? choice : scan4seq_default; + return scan4seq(range, len, seq); +} +#endif /* __has_builtin(__builtin_cpu_supports */ +#endif /* scan4seq */ + +//------------------------------------------------------------------------------ + /* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. * From d28110373ebed9bf6834b392b79853b13a6c8810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 27 Jul 2022 20:46:10 +0300 Subject: [PATCH 063/364] mdbx: add simple `SORT_CMP_SWAP()` macro for `MDBX_HAVE_CMOV=0` case. 
--- src/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/core.c b/src/core.c index 37bf0048..ddb9de38 100644 --- a/src/core.c +++ b/src/core.c @@ -1872,6 +1872,7 @@ static int lcklist_detach_locked(MDBX_env *env) { * and network-sort for small chunks. * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */ +#if MDBX_HAVE_CMOV #define SORT_CMP_SWAP(TYPE, CMP, a, b) \ do { \ const TYPE swap_tmp = (a); \ @@ -1879,6 +1880,16 @@ static int lcklist_detach_locked(MDBX_env *env) { (a) = swap_cmp ? swap_tmp : b; \ (b) = swap_cmp ? b : swap_tmp; \ } while (0) +#else +#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ + do \ + if (!CMP(a, b)) { \ + const TYPE swap_tmp = (a); \ + (a) = (b); \ + (b) = swap_tmp; \ + } \ + while (0) +#endif // 3 comparators, 3 parallel operations // o-----^--^--o From c06d072daf930b3f2649b98c22903e8034b6a2b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 28 Jul 2022 17:02:51 +0300 Subject: [PATCH 064/364] mdbx-make: support for `MDBX_BUILD_CXX=YES/NO` option. 
--- GNUmakefile | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index f9f9707c..063a9d46 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -53,8 +53,9 @@ CFLAGS_EXTRA ?= LD ?= ld # build options -MDBX_BUILD_OPTIONS ?=-DNDEBUG=1 +MDBX_BUILD_OPTIONS ?=-DNDEBUG=1 MDBX_BUILD_TIMESTAMP ?=$(shell date +%Y-%m-%dT%H:%M:%S%z) +MDBX_BUILD_CXX ?= YES # probe and compose common compiler flags with variable expansion trick (seems this work two times per session for GNU Make 3.81) CFLAGS ?= $(strip $(eval CFLAGS := -std=gnu11 -O2 -g -Wall -Werror -Wextra -Wpedantic -ffunction-sections -fPIC -fvisibility=hidden -pthread -Wno-error=attributes $$(shell for opt in -fno-semantic-interposition -Wno-unused-command-line-argument -Wno-tautological-compare; do [ -z "$$$$($(CC) '-DMDBX_BUILD_FLAGS="probe"' $$$${opt} -c $(SRC_PROBE_C) -o /dev/null >/dev/null 2>&1 || echo failed)" ] && echo "$$$${opt} "; done)$(CFLAGS_EXTRA))$(CFLAGS)) @@ -127,6 +128,9 @@ TIP := // TIP: .PHONY: all help options lib libs tools clean install uninstall check_buildflags_tag tools-static .PHONY: install-strip install-no-strip strip libmdbx mdbx show-options lib-static lib-shared +boolean = $(if $(findstring $(strip $($1)),YES Yes yes y ON On on 1 true True TRUE),1,$(if $(findstring $(strip $($1)),NO No no n OFF Off off 0 false False FALSE),,$(error Wrong value `$($1)` of $1 for YES/NO option))) +select_by = $(if $(call boolean,$(1)),$(2),$(3)) + ifeq ("$(origin V)", "command line") MDBX_BUILD_VERBOSE := $(V) endif @@ -134,7 +138,7 @@ ifndef MDBX_BUILD_VERBOSE MDBX_BUILD_VERBOSE := 0 endif -ifeq ($(MDBX_BUILD_VERBOSE),1) +ifeq ($(call boolean,MDBX_BUILD_VERBOSE),1) QUIET := HUSH := $(info $(TIP) Use `make V=0` for quiet.) 
@@ -193,12 +197,12 @@ help: show-options: @echo " MDBX_BUILD_OPTIONS = $(MDBX_BUILD_OPTIONS)" + @echo " MDBX_BUILD_CXX = $(MDBX_BUILD_CXX)" @echo " MDBX_BUILD_TIMESTAMP = $(MDBX_BUILD_TIMESTAMP)" @echo '$(TIP) Use `make options` to listing available build options.' - @echo " CC =`which $(CC)` | `$(CC) --version | head -1`" - @echo " CFLAGS =$(CFLAGS)" - @echo " CXXFLAGS =$(CXXFLAGS)" - @echo " LDFLAGS =$(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) $(EXE_LDFLAGS)" + @echo $(call select_by,MDBX_BUILD_CXX," CXX =`which $(CXX)` | `$(CXX) --version | head -1`"," CC =`which $(CC)` | `$(CC) --version | head -1`") + @echo $(call select_by,MDBX_BUILD_CXX," CXXFLAGS =$(CXXFLAGS)"," CFLAGS =$(CFLAGS)") + @echo $(call select_by,MDBX_BUILD_CXX," LDFLAGS =$(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) $(EXE_LDFLAGS)"," LDFLAGS =$(LDFLAGS) $(LIBS) $(EXE_LDFLAGS)") @echo '$(TIP) Use `make help` to listing available targets.' options: @@ -254,7 +258,7 @@ clean: config.h src/config.h src/version.c *.tar* buildflags.tag \ mdbx_*.static mdbx_*.static-lto -MDBX_BUILD_FLAGS =$(strip $(MDBX_BUILD_OPTIONS) $(CXXSTD) $(CFLAGS) $(LDFLAGS) $(LIBS)) +MDBX_BUILD_FLAGS =$(strip MDBX_BUILD_CXX=$(MDBX_BUILD_CXX) $(MDBX_BUILD_OPTIONS) $(call select_by,MDBX_BUILD_CXX,$(CXXFLAGS) $(LDFLAGS) $(LIB_STDCXXFS) $(LIBS),$(CFLAGS) $(LDFLAGS) $(LIBS))) check_buildflags_tag: $(QUIET)if [ "$(MDBX_BUILD_FLAGS)" != "$$(cat buildflags.tag 2>&1)" ]; then \ echo -n " CLEAN for build with specified flags..." && \ @@ -264,13 +268,13 @@ check_buildflags_tag: buildflags.tag: check_buildflags_tag -lib-static libmdbx.a: mdbx-static.o mdbx++-static.o +lib-static libmdbx.a: mdbx-static.o $(call select_by,MDBX_BUILD_CXX,mdbx++-static.o) @echo ' AR $@' $(QUIET)$(AR) rcs $@ $? 
$(HUSH) -lib-shared libmdbx.$(SO_SUFFIX): mdbx-dylib.o mdbx++-dylib.o +lib-shared libmdbx.$(SO_SUFFIX): mdbx-dylib.o $(call select_by,MDBX_BUILD_CXX,mdbx++-dylib.o) @echo ' LD $@' - $(QUIET)$(CXX) $(CXXFLAGS) $^ -pthread -shared $(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) -o $@ + $(QUIET)$(call select_by,MDBX_BUILD_CXX,$(CXX) $(CXXFLAGS),$(CC) $(CFLAGS)) $^ -pthread -shared $(LDFLAGS) $(call select_by,MDBX_BUILD_CXX,$(LIB_STDCXXFS)) $(LIBS) -o $@ #> dist-cutoff-begin ifeq ($(wildcard mdbx.c),mdbx.c) From a44eb1accbd91d638b5385815d69b0aee6d85284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 28 Jul 2022 18:48:21 +0300 Subject: [PATCH 065/364] mdbx-cmake: add `MDBX_MANAGE_BUILD_FLAGS` build-time option. --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7a939dc..0b9155aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,6 +233,7 @@ if(SUBPROJECT) if(NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independent (PIC)" ON) endif() + set(MDBX_MANAGE_BUILD_FLAGS_DEFAULT OFF) else() option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)" ON) option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independent (PIC)" ON) @@ -341,9 +342,14 @@ else() endif() endif(NOT MDBX_AMALGAMATED_SOURCE) - setup_compile_flags() + set(MDBX_MANAGE_BUILD_FLAGS_DEFAULT ON) endif(SUBPROJECT) +option(MDBX_MANAGE_BUILD_FLAGS "Allow libmdbx to configure/manage/override its own build flags" ${MDBX_MANAGE_BUILD_FLAGS_DEFAULT}) +if(MDBX_MANAGE_BUILD_FLAGS) + setup_compile_flags() +endif() + list(FIND CMAKE_C_COMPILE_FEATURES c_std_11 HAS_C11) list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_11 HAS_CXX11) list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_14 HAS_CXX14) From 77635116c62c79c680138155d2f2b9f08f8dbce0 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 4 Aug 2022 14:28:35 +0300 Subject: [PATCH 066/364] mdbx: enable solib profiling with `-pg` and `gprof` with GLIBC >= 2.37. However such profiling requires https://sourceware.org/bugzilla/show_bug.cgi?id=29438 to be fixed. --- src/core.c | 9 +++++++-- src/internals.h | 10 ++++++++++ src/lck-posix.c | 20 ++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index ddb9de38..816dd55f 100644 --- a/src/core.c +++ b/src/core.c @@ -1459,7 +1459,8 @@ __cold void mdbx_rthc_global_init(void) { } bootid = mdbx_osal_bootid(); -#if 0 /* debug */ + +#if 0 /* debug */ for (unsigned i = 0; i < 65536; ++i) { size_t pages = pv2pages(i); unsigned x = pages2pv(pages); @@ -1469,7 +1470,7 @@ __cold void mdbx_rthc_global_init(void) { assert(pages == xp); } fflush(stdout); -#endif +#endif /* #if 0 */ } /* dtor called for thread, i.e. 
for all mdbx's environment objects */ @@ -1547,6 +1548,7 @@ __cold void mdbx_rthc_thread_dtor(void *rthc) { #endif } +MDBX_EXCLUDE_FOR_GPROF __cold void mdbx_rthc_global_dtor(void) { mdbx_trace(">> pid %d", mdbx_getpid()); @@ -23877,6 +23879,9 @@ __dll_export #endif /* MDBX_BUILD_TYPE */ , "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG) +#ifdef ENABLE_GPROF + " ENABLE_GPROF" +#endif /* ENABLE_GPROF */ " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS) " BYTE_ORDER=" #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/src/internals.h b/src/internals.h index cc2a578a..4d830abf 100644 --- a/src/internals.h +++ b/src/internals.h @@ -195,6 +195,16 @@ #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif diff --git a/src/lck-posix.c b/src/lck-posix.c index a1a465f8..d4c26e4c 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -32,6 +32,7 @@ uint32_t mdbx_linux_kernel_version; bool mdbx_RunningOnWSL1; #endif /* xMDBX_ALLOY */ +MDBX_EXCLUDE_FOR_GPROF __cold static uint8_t probe_for_WSL(const char *tag) { const char *const WSL = strstr(tag, "WSL"); if (WSL && WSL[3] >= '2' && WSL[3] <= '9') @@ -48,8 +49,22 @@ __cold static uint8_t probe_for_WSL(const char *tag) { #endif /* Linux */ +#ifdef ENABLE_GPROF +extern void _mcleanup(void); +extern void monstartup(unsigned long, unsigned long); +extern void _init(void); +extern void _fini(void); +extern void __gmon_start__(void) __attribute__((__weak__)); +#endif /* ENABLE_GPROF */ + +MDBX_EXCLUDE_FOR_GPROF __cold static __attribute__((__constructor__)) void mdbx_global_constructor(void) { +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + monstartup((uintptr_t)&_init, (uintptr_t)&_fini); +#endif /* ENABLE_GPROF */ + #if 
defined(__linux__) || defined(__gnu_linux__) struct utsname buffer; if (uname(&buffer) == 0) { @@ -84,9 +99,14 @@ mdbx_global_constructor(void) { mdbx_rthc_global_init(); } +MDBX_EXCLUDE_FOR_GPROF __cold static __attribute__((__destructor__)) void mdbx_global_destructor(void) { mdbx_rthc_global_dtor(); +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + _mcleanup(); +#endif /* ENABLE_GPROF */ } /*----------------------------------------------------------------------------*/ From 654b020bc72d6e2af7905939cdc4aaba1ae2e451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 27 Jul 2022 17:51:03 +0300 Subject: [PATCH 067/364] mdbx: add `__restrict` to quicksort internal pointers. --- src/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/core.c b/src/core.c index 816dd55f..b2acb378 100644 --- a/src/core.c +++ b/src/core.c @@ -2847,11 +2847,12 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *lo, *hi; \ } NAME##_stack; \ \ - static __hot void NAME(TYPE *const begin, TYPE *const end) { \ - NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *top = stack; \ + static __hot void NAME(TYPE *const __restrict begin, \ + TYPE *const __restrict end) { \ + NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *__restrict top = stack; \ \ - TYPE *hi = end - 1; \ - TYPE *lo = begin; \ + TYPE *__restrict hi = end - 1; \ + TYPE *__restrict lo = begin; \ while (true) { \ const ptrdiff_t len = hi - lo; \ if (len < 16) { \ @@ -2862,7 +2863,7 @@ static int lcklist_detach_locked(MDBX_env *env) { continue; \ } \ \ - TYPE *mid = lo + (len >> 1); \ + TYPE *__restrict mid = lo + (len >> 1); \ SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \ SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ From 8404cc1fd7f7e3c5a788451b4707bd91a3b8b51d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 27 Jul 2022 21:19:48 +0300 Subject: [PATCH 068/364] mdbx: reduce sorting-network to 8. --- src/core.c | 704 +---------------------------------------------------- 1 file changed, 2 insertions(+), 702 deletions(-) diff --git a/src/core.c b/src/core.c index b2acb378..5077cabc 100644 --- a/src/core.c +++ b/src/core.c @@ -2082,686 +2082,10 @@ static int lcklist_detach_locked(MDBX_env *env) { SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ } while (0) -// 25 comparators, 9 parallel operations -// o--^-----^--^-----^-----------------------------------o -// | | | | -// o--v--^--v--|-----|--^-----^-----------^--------------o -// | | | | | | -// o-----v-----|-----|--|-----|--^-----^--|--^-----^--^--o -// | | | | | | | | | | -// o--^-----^--v--^--v--|-----|--|-----|--v--|-----|--v--o -// | | | | | | | | | -// o--v--^--v-----|-----v--^--v--|-----|-----|--^--v-----o -// | | | | | | | -// o-----v--------|--------|-----v--^--v--^--|--|--^-----o -// | | | | | | | -// o--^-----^-----v--------|--------|-----|--v--v--v-----o -// | | | | | -// o--v--^--v--------------v--------|-----v--------------o -// | | -// o-----v--------------------------v--------------------o -// -// [[0,1],[3,4],[6,7]] -// [[1,2],[4,5],[7,8]] -// [[0,1],[3,4],[6,7],[2,5]] -// [[0,3],[1,4],[5,8]] -// [[3,6],[4,7],[2,5]] -// [[0,3],[1,4],[5,7],[2,6]] -// [[1,3],[4,6]] -// [[2,4],[5,6]] -// [[2,3]] -#define SORT_NETWORK_9(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - } while (0) - -// 29 comparators, 9 parallel operations -// o--------------^-----^--^--^-----------------------o -// | | | | -// o-----------^--|--^--|--|--v--^--------^-----------o -// | | | | | | | -// o--------^--|--|--|--|--v--^--v-----^--|--^--------o -// | | | | | | | | | -// o-----^--|--|--|--|--v--^--|-----^--|--v--v--^-----o -// | | | | | | | | | | -// o--^--|--|--|--|--v-----|--v--^--|--|--^-----v--^--o -// | | | | | | | | | | | -// o--|--|--|--|--v--^-----|--^--|--v--v--|-----^--v--o -// | | | | | | | | | | -// o--|--|--|--v--^--|-----v--|--v--^-----|--^--v-----o -// | | | | | | | | | -// o--|--|--v-----|--|--^-----v--^--|-----v--v--------o -// | | | | | | | -// o--|--v--------|--v--|--^-----v--v-----------------o -// | | | | -// o--v-----------v-----v--v--------------------------o -// -// [[4,9],[3,8],[2,7],[1,6],[0,5]] -// [[1,4],[6,9],[0,3],[5,8]] -// [[0,2],[3,6],[7,9]] -// [[0,1],[2,4],[5,7],[8,9]] -// [[1,2],[4,6],[7,8],[3,5]] -// [[2,5],[6,8],[1,3],[4,7]] -// [[2,3],[6,7]] -// [[3,4],[5,6]] -// [[4,5]] -#define SORT_NETWORK_10(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[7]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - } while (0) - -// 35 comparators, 9 parallel operations -// o--^-----^-----------------^--------^--------------------o -// | | | | -// o--v--^--|--^--^--------^--|--------|--^-----------------o -// | | | | | | | | -// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o -// | | | | | | | | | | -// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o -// | | | | | | | | | | | -// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o -// | | | | | | | | | -// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o -// | | | | | | | | | -// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o -// | | | | | | | | | -// 
o--v--v--------|--|-----|-----v--|--^-----|-----^--|--^--o -// | | | | | | | | | -// o--^--^--------|--|-----|--------v--|-----v--^--|--v--v--o -// | | | | | | | | -// o--v--|--^-----|--v-----|-----------|--------v--v--------o -// | | | | | -// o-----v--v-----v--------v-----------v--------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9]] -// [[1,3],[5,7],[0,2],[4,6],[8,10]] -// [[1,2],[5,6],[9,10],[0,4],[3,7]] -// [[1,5],[6,10],[4,8]] -// [[5,9],[2,6],[0,4],[3,8]] -// [[1,5],[6,10],[2,3],[8,9]] -// [[1,4],[7,10],[3,5],[6,8]] -// [[2,4],[7,9],[5,6]] -// [[3,4],[7,8]] -#define SORT_NETWORK_11(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - } while (0) - -// 39 comparators, parallel operations -// o--^-----^-----------------^--------^--------------------o -// | | | | -// o--v--^--|--^--^--------^--|--------|--^-----------------o -// | | | | | | | | -// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o -// | | | | | | | | | | -// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o -// | | | | | | | | | | | -// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o -// | | | | | | | | | -// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o -// | | | | | | | | | -// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o -// | | | | | | | | | -// o--v--v--------|--|-----|--^--v--|--^--^--|-----^--|--^--o -// | | | | | | | | | | | -// o--^-----^-----|--|-----|--|-----v--|--|--v--^--|--v--v--o -// | | | | | | | | | | -// o--v--^--|--^--|--v-----|--|--------|--|-----v--v--------o -// | | | | | | | | -// o--^--|--v--v--v--------v--|--------|--v-----------------o -// | | | | -// o--v--v--------------------v--------v--------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]] -// [[1,3],[5,7],[9,11],[0,2],[4,6],[8,10]] -// [[1,2],[5,6],[9,10],[0,4],[7,11]] -// [[1,5],[6,10],[3,7],[4,8]] -// [[5,9],[2,6],[0,4],[7,11],[3,8]] -// [[1,5],[6,10],[2,3],[8,9]] -// [[1,4],[7,10],[3,5],[6,8]] -// [[2,4],[7,9],[5,6]] -// [[3,4],[7,8]] -#define SORT_NETWORK_12(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - } while (0) - -// 45 comparators, 10 parallel operations -// o--------^--^-----^-----------------------------^-----------------o -// | | | | -// o--^-----|--v-----|-----^--------------^-----^--|-----^-----------o -// | | | | | | | | -// 
o--|-----|--^--^--v-----|--------------|--^--|--|--^--v--^--------o -// | | | | | | | | | | | -// o--|--^--|--|--v-----^--|--------^-----|--|--v--|--|--^--v-----^--o -// | | | | | | | | | | | | | -// o--|--v--|--|--^-----|--v-----^--v-----|--|--^--|--|--|--^--^--v--o -// | | | | | | | | | | | | | | -// o--|--^--|--|--|--^--|--------|-----^--|--|--|--v--v--v--|--v--^--o -// | | | | | | | | | | | | | | -// o--|--|--|--v--v--|--|--^-----|--^--v--|--v--|--^--------v--^--v--o -// | | | | | | | | | | | | -// o--v--|--|-----^--|--v--|--^--|--|-----v-----v--|--^--------v-----o -// | | | | | | | | | | -// o-----v--|--^--|--|-----|--v--|--|--^-----^-----v--v--^-----------o -// | | | | | | | | | | -// o--^-----|--|--|--v-----|-----v--|--v--^--|--^--------v-----------o -// | | | | | | | | | -// o--|-----|--|--|--^-----|--------v--^--|--v--v--------------------o -// | | | | | | | | -// o--v-----|--v--|--v-----|--^--------v--v--------------------------o -// | | | | -// o--------v-----v--------v--v--------------------------------------o -// -// [[1,7],[9,11],[3,4],[5,8],[0,12],[2,6]] -// [[0,1],[2,3],[4,6],[8,11],[7,12],[5,9]] -// [[0,2],[3,7],[10,11],[1,4],[6,12]] -// [[7,8],[11,12],[4,9],[6,10]] -// [[3,4],[5,6],[8,9],[10,11],[1,7]] -// [[2,6],[9,11],[1,3],[4,7],[8,10],[0,5]] -// [[2,5],[6,8],[9,10]] -// [[1,2],[3,5],[7,8],[4,6]] -// [[2,3],[4,5],[6,7],[8,9]] -// [[3,4],[5,6]] -#define SORT_NETWORK_13(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - } while (0) - -/* *INDENT-OFF* */ -/* clang-format off */ - -// 51 comparators, 10 parallel operations -// o--^--^-----^-----------^-----------------------------------------------------------o -// | | | | -// 
o--v--|--^--|--^--------|--^-----^-----------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v-----------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^--------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^-----------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^--^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | -// o--^--v--|--|--|--------------v--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | -// o--v-----v--|--|-----------------v--|--|--------|--v-----|--^--------|--|--^--------o -// | | | | | | | | | | -// o--^--------v--|--------------------v--|--------v--------|--|--------v--v--v--------o -// | | | | | -// o--v-----------v-----------------------v-----------------v--v-----------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] -// [[0,2],[4,6],[8,10],[1,3],[5,7],[9,11]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[3,7]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13]] -// [[5,10],[6,9],[3,12],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - -/* *INDENT-ON* */ -/* clang-format on */ - -#define 
SORT_NETWORK_14(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, 
begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - -/* *INDENT-OFF* */ -/* clang-format off */ - -// 56 comparators, 10 parallel operations -// o--^--^-----^-----------^--------------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^--------------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v--------------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^-----------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^--------------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^-----^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--^--|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | | -// 
o--^--v--|--|--|--^-----------v--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | | | -// o--v-----v--|--|--|--------------v--|--|--|--------|--v-----|--^--^-----|--|--^--------o -// | | | | | | | | | | | | | -// o--^--^-----v--|--|-----------------v--|--|--------v--------|--|--|-----v--v--v--------o -// | | | | | | | | | -// o--v--|--------v--|--------------------v--|--^--------------v--|--v--------------------o -// | | | | | -// o-----v-----------v-----------------------v--v-----------------v-----------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] -// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14]] -// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - -/* *INDENT-ON* */ -/* clang-format on */ - -#define SORT_NETWORK_15(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], 
begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - -/* *INDENT-OFF* */ -/* clang-format off */ - 
-// 60 comparators, 10 parallel operations -// o--^--^-----^-----------^-----------------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^-----------------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v-----------------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^--------------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^-----------------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^--------^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--^-----|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--^--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--|--|--^-----------v--|--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | | | | -// o--v-----v--|--|--|--^-----------v--|--|--|--|--------|--v-----|--^--^-----|--|--^--------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--------------v--|--|--|--------v--------|--|--|-----v--v--v--------o -// | | | | | | | | | | | -// o--v--|--^-----v--|--|-----------------v--|--|--^--------------v--|--v--------------------o -// | | | | | | | | -// o--^--v--|--------v--|--------------------v--|--v-----------------v-----------------------o -// | | | | -// 
o--v-----v-----------v-----------------------v--------------------------------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13],[14,15]] -// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11],[13,15]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7],[11,15]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14],[7,15]] -// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - -/* *INDENT-ON* */ -/* clang-format on */ - -#define SORT_NETWORK_16(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[14], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, 
CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - #define SORT_INNER(TYPE, CMP, begin, end, len) \ switch (len) { \ default: \ + assert(false); \ __unreachable(); \ case 0: \ case 1: \ @@ -2787,30 +2111,6 @@ static int lcklist_detach_locked(MDBX_env *env) { case 8: \ SORT_NETWORK_8(TYPE, CMP, begin); \ break; \ - case 9: \ - 
SORT_NETWORK_9(TYPE, CMP, begin); \ - break; \ - case 10: \ - SORT_NETWORK_10(TYPE, CMP, begin); \ - break; \ - case 11: \ - SORT_NETWORK_11(TYPE, CMP, begin); \ - break; \ - case 12: \ - SORT_NETWORK_12(TYPE, CMP, begin); \ - break; \ - case 13: \ - SORT_NETWORK_13(TYPE, CMP, begin); \ - break; \ - case 14: \ - SORT_NETWORK_14(TYPE, CMP, begin); \ - break; \ - case 15: \ - SORT_NETWORK_15(TYPE, CMP, begin); \ - break; \ - case 16: \ - SORT_NETWORK_16(TYPE, CMP, begin); \ - break; \ } #define SORT_SWAP(TYPE, a, b) \ @@ -2855,7 +2155,7 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *__restrict lo = begin; \ while (true) { \ const ptrdiff_t len = hi - lo; \ - if (len < 16) { \ + if (len < 8) { \ SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \ if (unlikely(top == stack)) \ break; \ From c0f8ecd6f2546780a1314557bf0b3be29c497a6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 27 Jul 2022 21:40:42 +0300 Subject: [PATCH 069/364] mdbx: add `expect_with_probability()` macro. 
--- src/base.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/base.h b/src/base.h index a40b34fd..9e2778ae 100644 --- a/src/base.h +++ b/src/base.h @@ -645,6 +645,16 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) From a11c045f1e08d1958c60e06ff1eebbb0934968c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 4 Aug 2022 16:55:37 +0300 Subject: [PATCH 070/364] mdbx: using `expect_with_probability()` macro. --- src/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/core.c b/src/core.c index 5077cabc..2f07c833 100644 --- a/src/core.c +++ b/src/core.c @@ -1878,14 +1878,14 @@ static int lcklist_detach_locked(MDBX_env *env) { #define SORT_CMP_SWAP(TYPE, CMP, a, b) \ do { \ const TYPE swap_tmp = (a); \ - const bool swap_cmp = CMP(swap_tmp, b); \ + const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \ (a) = swap_cmp ? swap_tmp : b; \ (b) = swap_cmp ? 
b : swap_tmp; \ } while (0) #else #define SORT_CMP_SWAP(TYPE, CMP, a, b) \ do \ - if (!CMP(a, b)) { \ + if (expect_with_probability(!CMP(a, b), 0, .5)) { \ const TYPE swap_tmp = (a); \ (a) = (b); \ (b) = swap_tmp; \ @@ -2138,7 +2138,7 @@ static int lcklist_detach_locked(MDBX_env *env) { \ static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \ while (++first <= last) \ - if (CMP(first[0], first[-1])) \ + if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \ return false; \ return true; \ } \ @@ -2171,9 +2171,9 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *right = hi - 1; \ TYPE *left = lo + 1; \ while (1) { \ - while (CMP(*left, *mid)) \ + while (expect_with_probability(CMP(*left, *mid), 0, .5)) \ ++left; \ - while (CMP(*mid, *right)) \ + while (expect_with_probability(CMP(*mid, *right), 0, .5)) \ --right; \ if (unlikely(left > right)) { \ if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \ @@ -2287,24 +2287,24 @@ static int lcklist_detach_locked(MDBX_env *env) { length >>= 1; \ const TYPE_LIST *const middle = first + length; \ const unsigned left = whole - length - 1; \ - const bool cmp = CMP(*middle, item); \ + const bool cmp = expect_with_probability(CMP(*middle, item), 0, .5); \ length = cmp ? left : length; \ first = cmp ? 
middle + 1 : first; \ } \ \ switch (length) { \ case 3: \ - if (!CMP(*first, item)) \ + if (expect_with_probability(!CMP(*first, item), 0, .5)) \ break; \ ++first; \ __fallthrough /* fall through */; \ case 2: \ - if (!CMP(*first, item)) \ + if (expect_with_probability(!CMP(*first, item), 0, .5)) \ break; \ ++first; \ __fallthrough /* fall through */; \ case 1: \ - if (!CMP(*first, item)) \ + if (expect_with_probability(!CMP(*first, item), 0, .5)) \ break; \ ++first; \ __fallthrough /* fall through */; \ @@ -2944,7 +2944,7 @@ __hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { MDBX_dp *l = dl->items + dl->sorted; MDBX_dp *r = end - 1; do { - const bool cmp = l->pgno > r->pgno; + const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5); *w = cmp ? *l : *r; l -= cmp; r += cmp - 1; From eac3d0499f9196bbb295a64c81b4b91a96568cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 4 Aug 2022 16:57:49 +0300 Subject: [PATCH 071/364] mdbx: minor refine/speedup `dpl_sort_slowpath()`. 
--- src/core.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/core.c b/src/core.c index 2f07c833..5974f555 100644 --- a/src/core.c +++ b/src/core.c @@ -2919,7 +2919,7 @@ RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY, #define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) -__hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { +__hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); const unsigned unsorted = dl->length - dl->sorted; @@ -2940,14 +2940,18 @@ __hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp)); dp_sort(tmp, tmp + unsorted); /* merge two parts from end to begin */ - MDBX_dp *w = dl->items + dl->length; - MDBX_dp *l = dl->items + dl->sorted; - MDBX_dp *r = end - 1; + MDBX_dp *__restrict w = dl->items + dl->length; + MDBX_dp *__restrict l = dl->items + dl->sorted; + MDBX_dp *__restrict r = end - 1; do { const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV + *w = cmp ? *l-- : *r--; +#else *w = cmp ? *l : *r; l -= cmp; r += cmp - 1; +#endif } while (likely(--w > l)); assert(r == tmp - 1); assert(dl->items[0].pgno == 0 && @@ -2968,12 +2972,12 @@ __hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { return dl; } -static __always_inline MDBX_dpl *mdbx_dpl_sort(const MDBX_txn *txn) { +static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT); assert(dl->sorted <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - return likely(dl->sorted == dl->length) ? 
dl : mdbx_dpl_sort_slowpath(txn); + return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn); } /* Returns the index of the first dirty-page whose pgno @@ -2994,7 +2998,7 @@ static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { switch (dl->length - dl->sorted) { default: /* sort a whole */ - mdbx_dpl_sort_slowpath(txn); + dpl_sort_slowpath(txn); break; case 0: /* whole sorted cases */ @@ -4023,7 +4027,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { } } else { /* Dirtylist is mostly sorted, just refund loose pages at the end. */ - mdbx_dpl_sort(txn); + dpl_sort(txn); mdbx_tassert(txn, dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno); mdbx_tassert(txn, dl->sorted == dl->length); @@ -4838,7 +4842,7 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } /* Сортируем чтобы запись на диск была полее последовательна */ - MDBX_dpl *const dl = mdbx_dpl_sort(txn); + MDBX_dpl *const dl = dpl_sort(txn); /* Preserve pages which may soon be dirtied again */ const unsigned unspillable = mdbx_txn_keep(txn, m0); @@ -8023,7 +8027,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->tw.dirtyroom = parent->tw.dirtyroom; txn->tw.dirtylru = parent->tw.dirtylru; - mdbx_dpl_sort(parent); + dpl_sort(parent); if (parent->tw.spill_pages) mdbx_spill_purge(parent); @@ -8383,7 +8387,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { mdbx_tassert(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); - MDBX_dpl *dl = mdbx_dpl_sort(txn); + MDBX_dpl *dl = dpl_sort(txn); /* Scanning in ascend order */ const int step = MDBX_PNL_ASCENDING ? 1 : -1; @@ -9740,7 +9744,7 @@ bailout: static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { MDBX_dpl *const dl = - (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : mdbx_dpl_sort(txn); + (txn->mt_flags & MDBX_WRITEMAP) ? 
txn->tw.dirtylist : dpl_sort(txn); int rc = MDBX_SUCCESS; unsigned r, w; for (w = 0, r = 1; r <= dl->length; ++r) { @@ -9792,10 +9796,10 @@ int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } /* Merge child txn into parent */ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const unsigned parent_retired_len) { - MDBX_dpl *const src = mdbx_dpl_sort(txn); + MDBX_dpl *const src = dpl_sort(txn); /* Remove refunded pages from parent's dirty list */ - MDBX_dpl *const dst = mdbx_dpl_sort(parent); + MDBX_dpl *const dst = dpl_sort(parent); if (MDBX_ENABLE_REFUND) { unsigned n = dst->length; while (n && dst->items[n].pgno >= parent->mt_next_pgno) { From 0dd45324734ac7b6613c55ade0f1b6bc50c099a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 4 Aug 2022 17:08:00 +0300 Subject: [PATCH 072/364] mdbx: reduce gap/backlog of linear scan inside `dpl_search()`. 
--- src/core.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/src/core.c b/src/core.c index 5974f555..626a79c1 100644 --- a/src/core.c +++ b/src/core.c @@ -2985,7 +2985,7 @@ static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { #define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) -static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { +__hot static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); if (mdbx_audit_enabled()) { @@ -3010,22 +3010,13 @@ static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { return dl->length - N + 1; \ __fallthrough - /* try linear search until the threshold */ - LINEAR_SEARCH_CASE(16); /* fall through */ - LINEAR_SEARCH_CASE(15); /* fall through */ - LINEAR_SEARCH_CASE(14); /* fall through */ - LINEAR_SEARCH_CASE(13); /* fall through */ - LINEAR_SEARCH_CASE(12); /* fall through */ - LINEAR_SEARCH_CASE(11); /* fall through */ - LINEAR_SEARCH_CASE(10); /* fall through */ - LINEAR_SEARCH_CASE(9); /* fall through */ - LINEAR_SEARCH_CASE(8); /* fall through */ - LINEAR_SEARCH_CASE(7); /* fall through */ - LINEAR_SEARCH_CASE(6); /* fall through */ - LINEAR_SEARCH_CASE(5); /* fall through */ - LINEAR_SEARCH_CASE(4); /* fall through */ - LINEAR_SEARCH_CASE(3); /* fall through */ - LINEAR_SEARCH_CASE(2); /* fall through */ + /* use linear scan until the threshold */ + LINEAR_SEARCH_CASE(7); /* fall through */ + LINEAR_SEARCH_CASE(6); /* fall through */ + LINEAR_SEARCH_CASE(5); /* fall through */ + LINEAR_SEARCH_CASE(4); /* fall through */ + LINEAR_SEARCH_CASE(3); /* fall through */ + LINEAR_SEARCH_CASE(2); /* fall through */ case 1: if (dl->items[dl->length].pgno == pgno) return dl->length; @@ -3053,7 +3044,7 @@ static __inline bool mdbx_dpl_intersect(const MDBX_txn 
*txn, pgno_t pgno, MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->sorted == dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - unsigned const n = mdbx_dpl_search(txn, pgno); + unsigned const n = dpl_search(txn, pgno); assert(n >= 1 && n <= dl->length + 1); assert(pgno <= dl->items[n].pgno); assert(pgno > dl->items[n - 1].pgno); @@ -3075,7 +3066,7 @@ static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, static __always_inline unsigned mdbx_dpl_exist(MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; - unsigned i = mdbx_dpl_search(txn, pgno); + unsigned i = dpl_search(txn, pgno); assert((int)i > 0); return (dl->items[i].pgno == pgno) ? i : 0; } @@ -4646,7 +4637,7 @@ static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { for (unsigned i = 0; i < mc->mc_snum; ++i) { const MDBX_page *mp = mc->mc_pg[i]; if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { - unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); + unsigned const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && mdbx_dpl_age(txn, n)) { txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; @@ -5398,7 +5389,7 @@ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - unsigned di = mdbx_dpl_search(txn, loose->mp_pgno); + unsigned di = dpl_search(txn, loose->mp_pgno); mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == loose); mdbx_dpl_remove(txn, di); txn->tw.loose_pages = loose->mp_next; @@ -8395,7 +8386,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const int end = MDBX_PNL_ASCENDING ? 
MDBX_PNL_SIZE(pl) + 1 : 0; mdbx_tassert(txn, pl[begin] <= pl[end - step]); - unsigned r = mdbx_dpl_search(txn, pl[begin] >> spilled); + unsigned r = dpl_search(txn, pl[begin] >> spilled); mdbx_tassert(txn, dl->sorted == dl->length); for (int i = begin; r <= dl->length;) { /* scan loop */ assert(i != end); @@ -13621,7 +13612,7 @@ __hot static __noinline MDBX_page *page_lookup_spilled(MDBX_txn *const txn, mdbx_search_spilled(spiller, pgno)) break; - const unsigned i = mdbx_dpl_search(spiller, pgno); + const unsigned i = dpl_search(spiller, pgno); mdbx_tassert(txn, (int)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; From 1215bda188484c130657efe6ce1a80fcf46d36d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 4 Aug 2022 15:38:54 +0300 Subject: [PATCH 073/364] mdbx: minor refine/speedup `node_del()`. --- src/core.c | 74 +++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/core.c b/src/core.c index 626a79c1..e2fc6263 100644 --- a/src/core.c +++ b/src/core.c @@ -3286,7 +3286,7 @@ static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, unsigned indx, const MDBX_val *key); -static void mdbx_node_del(MDBX_cursor *mc, size_t ksize); +static void node_del(MDBX_cursor *mc, size_t ksize); static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft); @@ -15790,7 +15790,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, flags |= F_DUPDATA; do_sub = true; if (!insert_key) - mdbx_node_del(mc, 0); + node_del(mc, 0); goto new_sub; } @@ -15829,7 +15829,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } } - mdbx_node_del(mc, 0); + node_del(mc, 0); } 
rdata = data; @@ -16336,22 +16336,19 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, * [in] mc Cursor pointing to the node to delete. * [in] ksize The size of a node. Only used if the page is * part of a MDBX_DUPFIXED database. */ -static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { +__hot static void node_del(MDBX_cursor *mc, size_t ksize) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - int indx = mc->mc_ki[mc->mc_top]; - int i, j, nkeys, ptr; - MDBX_node *node; - char *base; + const unsigned hole = mc->mc_ki[mc->mc_top]; + const unsigned nkeys = page_numkeys(mp); - mdbx_debug("delete node %u on %s page %" PRIaPGNO, indx, + mdbx_debug("delete node %u on %s page %" PRIaPGNO, hole, IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); - nkeys = page_numkeys(mp); - mdbx_cassert(mc, indx < nkeys); + mdbx_cassert(mc, hole < nkeys); if (IS_LEAF2(mp)) { mdbx_cassert(mc, ksize >= sizeof(indx_t)); - unsigned diff = nkeys - 1 - indx; - base = page_leaf2key(mp, indx, ksize); + unsigned diff = nkeys - 1 - hole; + char *base = page_leaf2key(mp, hole, ksize); if (diff) memmove(base, base + ksize, diff * ksize); mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); @@ -16362,36 +16359,29 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { return; } - node = page_node(mp, indx); - mdbx_cassert(mc, !IS_BRANCH(mp) || indx || node_ks(node) == 0); - size_t sz = NODESIZE + node_ks(node); - if (IS_LEAF(mp)) { - if (F_ISSET(node_flags(node), F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += node_ds(node); - } - sz = EVEN(sz); + MDBX_node *node = page_node(mp, hole); + mdbx_cassert(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0); + size_t hole_size = NODESIZE + node_ks(node); + if (IS_LEAF(mp)) + hole_size += + (node_flags(node) & F_BIGDATA) ? 
sizeof(pgno_t) : node_ds(node); + hole_size = EVEN(hole_size); - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < nkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) { - mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_ptrs[j] >= sz); - mp->mp_ptrs[j] += (indx_t)sz; - } - j++; - } - } + const indx_t hole_offset = mp->mp_ptrs[hole]; + unsigned r, w; + for (r = w = 0; r < nkeys; r++) + if (r != hole) + mp->mp_ptrs[w++] = (mp->mp_ptrs[r] < hole_offset) + ? mp->mp_ptrs[r] + (indx_t)hole_size + : mp->mp_ptrs[r]; - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + sz, base, ptr - mp->mp_upper); + char *base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + memmove(base + hole_size, base, hole_offset - mp->mp_upper); mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= sz); - mp->mp_upper += (indx_t)sz; + mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size); + mp->mp_upper += (indx_t)hole_size; if (mdbx_audit_enabled()) { const uint8_t checking = mc->mc_checking; @@ -16968,7 +16958,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { /* not enough space left, do a delete and split */ mdbx_debug("Not enough room, delta = %zd, splitting...", delta); pgno_t pgno = node_pgno(node); - mdbx_node_del(mc, 0); + node_del(mc, 0); int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); if (err == MDBX_SUCCESS && mdbx_audit_enabled()) err = mdbx_cursor_check_updating(mc); @@ -17173,7 +17163,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { return rc; /* Delete the node from the source page. 
*/ - mdbx_node_del(csrc, key4move.iov_len); + node_del(csrc, key4move.iov_len); mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); @@ -17424,7 +17414,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { /* Unlink the src page from parent and add to free list. */ csrc->mc_top--; - mdbx_node_del(csrc, 0); + node_del(csrc, 0); if (csrc->mc_ki[csrc->mc_top] == 0) { const MDBX_val nullkey = {0, 0}; rc = mdbx_update_key(csrc, &nullkey); @@ -18313,7 +18303,7 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; - mdbx_node_del(mc, mc->mc_db->md_xsize); + node_del(mc, mc->mc_db->md_xsize); mc->mc_db->md_entries--; /* Adjust other cursors pointing to mp */ From c05a3b7bb9f403e0a046baf0e79a296851f1d0d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 4 Aug 2022 16:23:42 +0300 Subject: [PATCH 074/364] mdbx: minor refine `node_add()`. 
--- src/core.c | 97 +++++++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/src/core.c b/src/core.c index e2fc6263..c4b59a4b 100644 --- a/src/core.c +++ b/src/core.c @@ -3273,18 +3273,14 @@ struct node_result { static struct node_result mdbx_node_search(MDBX_cursor *mc, const MDBX_val *key); -static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - pgno_t pgno); -static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - MDBX_val *data, - unsigned flags); -static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key); +static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key, + pgno_t pgno); +static int __must_check_result node_add_leaf(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key, + MDBX_val *data, unsigned flags); +static int __must_check_result node_add_leaf2(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key); static void node_del(MDBX_cursor *mc, size_t ksize); static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); @@ -15848,9 +15844,9 @@ new_sub:; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { mdbx_cassert(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && rdata->iov_len == 0); - rc = mdbx_node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); + rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); } else - rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); + rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; @@ -16155,9 +16151,9 @@ static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { return ret; } -static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key) { +__hot static int __must_check_result 
node_add_leaf2(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; mdbx_debug("add to leaf2-%spage %" PRIaPGNO " index %i, " @@ -16192,10 +16188,9 @@ static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - pgno_t pgno) { +static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, + const MDBX_val *key, + pgno_t pgno) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; mdbx_debug("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO @@ -16236,11 +16231,11 @@ static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - MDBX_val *data, - unsigned flags) { +__hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key, + MDBX_val *data, + unsigned flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; mdbx_debug("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR @@ -16313,22 +16308,19 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, void *nodedata = node_data(node); if (likely(largepage == NULL)) { - if (unlikely(flags & F_BIGDATA)) + if (unlikely(flags & F_BIGDATA)) { memcpy(nodedata, data->iov_base, sizeof(pgno_t)); - else if (unlikely(flags & MDBX_RESERVE)) - data->iov_base = nodedata; - else if (likely(nodedata != data->iov_base && - data->iov_len /* to avoid UBSAN traps*/ != 0)) - memcpy(nodedata, data->iov_base, data->iov_len); + return MDBX_SUCCESS; + } } else { poke_pgno(nodedata, largepage->mp_pgno); nodedata = page_data(largepage); - if (unlikely(flags & MDBX_RESERVE)) - data->iov_base = nodedata; - else if (likely(nodedata != data->iov_base && - data->iov_len /* to avoid UBSAN traps*/ != 0)) - 
memcpy(nodedata, data->iov_base, data->iov_len); } + if (unlikely(flags & MDBX_RESERVE)) + data->iov_base = nodedata; + else if (likely(nodedata != data->iov_base && + data->iov_len /* to avoid UBSAN traps*/ != 0)) + memcpy(nodedata, data->iov_base, data->iov_len); return MDBX_SUCCESS; } @@ -17112,8 +17104,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ - rc = - mdbx_node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); + rc = node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); } break; case P_LEAF: { @@ -17133,8 +17124,8 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ - rc = mdbx_node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, - node_flags(srcnode)); + rc = node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, + node_flags(srcnode)); } break; case P_LEAF | P_LEAF2: { @@ -17151,7 +17142,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. 
*/ - rc = mdbx_node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); + rc = node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); } break; default: @@ -17339,7 +17330,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { key.iov_base = page_data(psrc); unsigned i = 0; do { - rc = mdbx_node_add_leaf2(cdst, j++, &key); + rc = node_add_leaf2(cdst, j++, &key); if (unlikely(rc != MDBX_SUCCESS)) return rc; key.iov_base = (char *)key.iov_base + key.iov_len; @@ -17387,10 +17378,10 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_val data; data.iov_len = node_ds(srcnode); data.iov_base = node_data(srcnode); - rc = mdbx_node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); + rc = node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); } else { mdbx_cassert(csrc, node_flags(srcnode) == 0); - rc = mdbx_node_add_branch(cdst, j++, &key, node_pgno(srcnode)); + rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); } if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -18541,7 +18532,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, foliage = mc->mc_db->md_depth++; /* Add left (implicit) pointer. */ - rc = mdbx_node_add_branch(mc, 0, NULL, mp->mp_pgno); + rc = node_add_branch(mc, 0, NULL, mp->mp_pgno); if (unlikely(rc != MDBX_SUCCESS)) { /* undo the pre-push */ mc->mc_pg[0] = mc->mc_pg[1]; @@ -18836,8 +18827,8 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, DKEY(mc->mc_ki[ptop] ? newkey : NULL)); mc->mc_top--; - rc = mdbx_node_add_branch(mc, mc->mc_ki[ptop], - mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno); + rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? 
newkey : NULL, + sister->mp_pgno); mdbx_cassert(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); @@ -18865,7 +18856,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mn.mc_top--; mdbx_trace("add-to-parent the right-entry[%u] for new sibling-page", mn.mc_ki[ptop]); - rc = mdbx_node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); + rc = node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); mn.mc_top++; if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -18877,12 +18868,12 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, switch (PAGETYPE_WHOLE(sister)) { case P_LEAF: { mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); - rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, naf); + rc = node_add_leaf(mc, 0, newkey, newdata, naf); } break; case P_LEAF | P_LEAF2: { mdbx_cassert(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); - rc = mdbx_node_add_leaf2(mc, 0, newkey); + rc = node_add_leaf2(mc, 0, newkey); } break; default: rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); @@ -18949,12 +18940,12 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, case P_BRANCH: { mdbx_cassert(mc, 0 == (uint16_t)flags); /* First branch index doesn't need key data. */ - rc = mdbx_node_add_branch(mc, n, n ? &rkey : NULL, pgno); + rc = node_add_branch(mc, n, n ? 
&rkey : NULL, pgno); } break; case P_LEAF: { mdbx_cassert(mc, pgno == 0); mdbx_cassert(mc, rdata != NULL); - rc = mdbx_node_add_leaf(mc, n, &rkey, rdata, flags); + rc = node_add_leaf(mc, n, &rkey, rdata, flags); } break; /* case P_LEAF | P_LEAF2: { mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); From 5afc5c4e8cec859af1418bebb1c44fe00fc364b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 5 Aug 2022 15:02:04 +0300 Subject: [PATCH 075/364] mdbx: reorganize/move fences to reduce overhead. --- src/core.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/core.c b/src/core.c index c4b59a4b..6370b896 100644 --- a/src/core.c +++ b/src/core.c @@ -5061,7 +5061,6 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, MDBX_NOTHROW_PURE_FUNCTION static __inline txnid_t constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) { - mdbx_memory_fence(mo_AcquireRelease, false); txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); mdbx_assert(env, a == b); @@ -5081,7 +5080,6 @@ static __inline void meta_cache_clear(MDBX_env *env) { static __inline txnid_t meta_txnid(const MDBX_env *env, volatile const MDBX_meta *meta) { (void)env; - mdbx_memory_fence(mo_AcquireRelease, false); txnid_t a = unaligned_peek_u64_volatile(4, &meta->mm_txnid_a); txnid_t b = unaligned_peek_u64_volatile(4, &meta->mm_txnid_b); return (a == b) ? 
a : 0; @@ -5213,7 +5211,8 @@ meta_mostrecent(const enum meta_choice_mode mode, const MDBX_env *env) { return head; } -static volatile const MDBX_meta *meta_prefer_steady(const MDBX_env *env) { +static __noinline volatile const MDBX_meta * +meta_prefer_steady(const MDBX_env *env) { return #if MDBX_CACHE_METAPTR ((MDBX_env *)env)->cache_steady_meta = @@ -5221,7 +5220,7 @@ static volatile const MDBX_meta *meta_prefer_steady(const MDBX_env *env) { meta_mostrecent(prefer_steady, env); } -MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * +MDBX_NOTHROW_PURE_FUNCTION static __inline const MDBX_meta * constmeta_prefer_steady(const MDBX_env *env) { #if MDBX_CACHE_METAPTR if (likely(env->cache_steady_meta)) { @@ -5233,7 +5232,8 @@ constmeta_prefer_steady(const MDBX_env *env) { return (const MDBX_meta *)meta_prefer_steady(env); } -static volatile const MDBX_meta *meta_prefer_last(const MDBX_env *env) { +__hot static __noinline volatile const MDBX_meta * +meta_prefer_last(const MDBX_env *env) { return #if MDBX_CACHE_METAPTR ((MDBX_env *)env)->cache_last_meta = @@ -5241,7 +5241,7 @@ static volatile const MDBX_meta *meta_prefer_last(const MDBX_env *env) { meta_mostrecent(prefer_last, env); } -MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * +MDBX_NOTHROW_PURE_FUNCTION static __inline const MDBX_meta * constmeta_prefer_last(const MDBX_env *env) { #if MDBX_CACHE_METAPTR if (likely(env->cache_last_meta)) { @@ -5252,11 +5252,11 @@ constmeta_prefer_last(const MDBX_env *env) { return (const MDBX_meta *)meta_prefer_last(env); } -static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { +__cold static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { while (true) { volatile const MDBX_meta *head = meta_prefer_last(env); const txnid_t recent = meta_txnid(env, head); - mdbx_memory_barrier(); + mdbx_memory_fence(mo_AcquireRelease, false); if (likely(head == meta_prefer_last(env) && recent == meta_txnid(env, head))) return recent; @@ -7488,6 +7488,7 @@ static int 
meta_waittxnid(const MDBX_env *env, const volatile MDBX_meta *meta, return MDBX_SUCCESS; if (likely(!is_timeout(timestamp))) { + mdbx_memory_fence(mo_AcquireRelease, true); #if defined(_WIN32) || defined(_WIN64) SwitchToThread(); #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) @@ -7624,11 +7625,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_dbs[1] = meta->mm_dbs[1]; txn->mt_canary = meta->mm_canary; - /* LY: Retry on a race, ITS#7970. - * The barrier is not needed here since C11-atomics are used, - * but it is reasonable paranoia to avoid compiler misoptimization - * and makes clarity for code readers. */ - mdbx_compiler_barrier(); + /* LY: Retry on a race, ITS#7970. */ + mdbx_memory_fence(mo_AcquireRelease, false); const txnid_t oldest = atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease); if (unlikely(target_txnid < oldest || @@ -8139,7 +8137,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); info->txn_space_leftover = pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); - mdbx_compiler_barrier(); + mdbx_memory_fence(mo_AcquireRelease, false); } while (unlikely(head_meta != meta_prefer_last(env) || head_txnid != meta_txnid(env, head_meta))); @@ -12086,6 +12084,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, } } else /* not recovery mode */ while (1) { + mdbx_memory_fence(mo_AcquireRelease, false); const unsigned meta_clash_mask = meta_eq_mask(env); if (unlikely(meta_clash_mask)) { mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); @@ -12120,7 +12119,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, MDBX_meta clone; const MDBX_meta *const steady = constmeta_prefer_steady(env); const MDBX_meta *const head = constmeta_prefer_last(env); - const txnid_t steady_txnid = meta_txnid(env, steady); + const txnid_t steady_txnid = 
constmeta_txnid(env, steady); if (META_IS_STEADY(steady)) { err = mdbx_validate_meta_copy(env, steady, &clone); if (unlikely(err != MDBX_SUCCESS)) { @@ -12135,7 +12134,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, } const pgno_t pgno = bytes2pgno(env, (uint8_t *)head - env->me_map); - const txnid_t head_txnid = meta_txnid(env, head); + const txnid_t head_txnid = constmeta_txnid(env, head); const bool head_valid = mdbx_validate_meta_copy(env, head, &clone) == MDBX_SUCCESS; mdbx_assert(env, !META_IS_STEADY(steady) || head_txnid != steady_txnid); @@ -12279,7 +12278,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, MDBX_meta *const pmeta = METAPAGE(env, n); if (unlikely(unaligned_peek_u64(4, &pmeta->mm_magic_and_version) != MDBX_DATA_MAGIC)) { - const txnid_t txnid = meta_txnid(env, pmeta); + const txnid_t txnid = constmeta_txnid(env, pmeta); mdbx_notice("%s %s" "meta[%u], txnid %" PRIaTXN, "updating db-format signature for", @@ -20214,6 +20213,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; + mdbx_memory_fence(mo_AcquireRelease, false); volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); arg->mi_recent_txnid = meta_txnid(env, recent_meta); arg->mi_meta0_txnid = meta_txnid(env, meta0); @@ -20979,7 +20979,7 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, const uint64_t head_pages_retired = unaligned_peek_u64_volatile(4, recent_meta->mm_pages_retired); const txnid_t head_txnid = meta_txnid(env, recent_meta); - mdbx_compiler_barrier(); + mdbx_memory_fence(mo_AcquireRelease, false); if (unlikely(recent_meta != meta_prefer_last(env) || head_pages_retired != unaligned_peek_u64_volatile( @@ -21186,6 +21186,7 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, mdbx_assert(env, oldest >= laggard); mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak); + 
mdbx_memory_fence(mo_AcquireRelease, false); const txnid_t steady = meta_txnid(env, meta_prefer_steady(env)); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck) @@ -21315,6 +21316,7 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) const pgno_t maxpg = meta->mm_geo.now; *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } + mdbx_memory_fence(mo_AcquireRelease, false); } while (unlikely(recent != meta_txnid(env, meta))); txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP; From fa854e40c3a3f3e0ad4ff98b68e608bf4dfaa95e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 7 Aug 2022 12:10:17 +0300 Subject: [PATCH 076/364] mdbx: refine checking inside `page_get()`. --- src/core.c | 185 +++++++++++++++++++++++++++-------------------------- 1 file changed, 94 insertions(+), 91 deletions(-) diff --git a/src/core.c b/src/core.c index 6370b896..26d1c823 100644 --- a/src/core.c +++ b/src/core.c @@ -13595,29 +13595,71 @@ static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { return MDBX_SUCCESS; } -__hot static __noinline MDBX_page *page_lookup_spilled(MDBX_txn *const txn, - const pgno_t pgno) { - const MDBX_txn *spiller = txn; - do { - /* Spilled pages were dirtied in this txn and flushed - * because the dirty list got full. Bring this page - * back in from the map (but don't unspill it here, - * leave that unless page_touch happens again). 
*/ - if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && - mdbx_search_spilled(spiller, pgno)) - break; - - const unsigned i = dpl_search(spiller, pgno); - mdbx_tassert(txn, (int)i > 0); - if (spiller->tw.dirtylist->items[i].pgno == pgno) { - spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; - return spiller->tw.dirtylist->items[i].ptr; +__hot static __always_inline int page_get_checker_lite(const uint16_t ILL, + const MDBX_page *page, + MDBX_txn *const txn, + const txnid_t front) { + if (unlikely(page->mp_flags & ILL)) { + if (ILL == P_ILL_BITS || (page->mp_flags & P_ILL_BITS)) + return bad_page(page, "invalid page's flags (%u)\n", page->mp_flags); + else if (ILL & P_OVERFLOW) { + assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0); + assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "large/overlow", "branch/leaf/leaf2", page->mp_flags); + } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) { + assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2)); + assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "branch/leaf/leaf2", "large/overlow", page->mp_flags); + } else { + assert(false); } + } - spiller = spiller->mt_parent; - } while (spiller); + if (unlikely(page->mp_txnid > front) && + unlikely(page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) + return bad_page( + page, + "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", + page->mp_txnid, + (front == txn->mt_front && front != txn->mt_txnid) ? 
"front-txn" + : "parent-page", + front); - return pgno2page(txn->mt_env, pgno); + if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) && + (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { + if (unlikely(page->mp_upper < page->mp_lower || + ((page->mp_lower | page->mp_upper) & 1) || + PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize)) + return bad_page(page, "invalid page' lower(%u)/upper(%u) with limit %u\n", + page->mp_lower, page->mp_upper, page_space(txn->mt_env)); + + } else if ((ILL & P_OVERFLOW) == 0) { + const pgno_t npages = page->mp_pages; + if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2)) + return bad_page(page, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(page->mp_pgno + npages > txn->mt_next_pgno)) + return bad_page( + page, + "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + page->mp_pgno + npages, txn->mt_next_pgno); + } else { + assert(false); + } + return MDBX_SUCCESS; +} + +__cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL, + MDBX_page *page, + MDBX_cursor *const mc, + const txnid_t front) { + pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)}; + if (likely(r.err == MDBX_SUCCESS)) + r.err = mdbx_page_check(mc, page); + if (unlikely(r.err != MDBX_SUCCESS)) + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return r; } __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, @@ -13630,95 +13672,56 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, pgr_t r; if (unlikely(pgno >= txn->mt_next_pgno)) { mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); - notfound: r.page = nullptr; r.err = MDBX_PAGE_NOTFOUND; bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + txn->mt_flags |= MDBX_TXN_ERROR; return r; } + mdbx_assert(txn->mt_env, + ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0); r.page = pgno2page(txn->mt_env, pgno); - if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) - r.page = 
page_lookup_spilled(txn, pgno); + if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) { + const MDBX_txn *spiller = txn; + do { + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). */ + if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && + mdbx_search_spilled(spiller, pgno)) + break; - MDBX_env *const env = txn->mt_env; - mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + const unsigned i = dpl_search(spiller, pgno); + mdbx_tassert(txn, (int)i > 0); + if (spiller->tw.dirtylist->items[i].pgno == pgno) { + spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; + r.page = spiller->tw.dirtylist->items[i].ptr; + break; + } + + spiller = spiller->mt_parent; + } while (spiller); + } if (unlikely(r.page->mp_pgno != pgno)) { r.err = bad_page( r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", r.page->mp_pgno, pgno); - goto notfound; - } - -#if !MDBX_DISABLE_VALIDATION - if (unlikely(r.page->mp_flags & ILL)) { - if (ILL == P_ILL_BITS || (r.page->mp_flags & P_ILL_BITS)) - r.err = bad_page(r.page, "invalid page's flags (%u)\n", r.page->mp_flags); - else if (ILL & P_OVERFLOW) { - assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0); - assert(r.page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); - r.err = bad_page(r.page, "unexpected %s instead of %s (%u)\n", - "large/overlow", "branch/leaf/leaf2", r.page->mp_flags); - } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) { - assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2)); - assert(r.page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); - r.err = bad_page(r.page, "unexpected %s instead of %s (%u)\n", - "branch/leaf/leaf2", "large/overlow", r.page->mp_flags); - } else { - assert(false); - } goto bailout; } - if (unlikely(r.page->mp_txnid > front) && - unlikely(r.page->mp_txnid > txn->mt_front || front < 
txn->mt_txnid)) { - r.err = bad_page( - r.page, - "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", - r.page->mp_txnid, - (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn" - : "parent-page", - front); - goto bailout; - } - - if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(r.page)) && - (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { - if (unlikely(r.page->mp_upper < r.page->mp_lower || - ((r.page->mp_lower | r.page->mp_upper) & 1) || - PAGEHDRSZ + r.page->mp_upper > env->me_psize)) { - r.err = - bad_page(r.page, "invalid page' lower(%u)/upper(%u) with limit %u\n", - r.page->mp_lower, r.page->mp_upper, page_space(env)); - goto bailout; - } - } else if ((ILL & P_OVERFLOW) == 0) { - const pgno_t npages = r.page->mp_pages; - if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) { - r.err = bad_page(r.page, "invalid n-pages (%u) for large-page\n", npages); - goto bailout; - } - if (unlikely(r.page->mp_pgno + npages > txn->mt_next_pgno)) { - r.err = bad_page( - r.page, - "end of large-page beyond (%u) allocated space (%u next-pgno)\n", - r.page->mp_pgno + npages, txn->mt_next_pgno); - goto bailout; - } - } else { - assert(false); - } -#else - (void)ILL; -#endif /* MDBX_DISABLE_VALIDATION */ - - if (unlikely(mc->mc_checking & CC_PAGECHECK) && - unlikely(MDBX_SUCCESS != (r.err = mdbx_page_check(mc, r.page)))) - goto bailout; + if (unlikely(mc->mc_checking & CC_PAGECHECK)) + return page_get_checker_full(ILL, r.page, mc, front); +#if MDBX_DISABLE_VALIDATION r.err = MDBX_SUCCESS; +#else + r.err = page_get_checker_lite(ILL, r.page, txn, front); + if (unlikely(r.err != MDBX_SUCCESS)) + goto bailout; +#endif /* MDBX_DISABLE_VALIDATION */ return r; } From 15146d3823a96b0811a14ad967ab5bfd91fa9352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 7 Aug 2022 14:37:45 +0300 Subject: [PATCH 077/364] mdbx: fix `scan4seq()` selection for non-implemented 
cases. --- src/core.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/src/core.c b/src/core.c index 857eba56..6d018165 100644 --- a/src/core.c +++ b/src/core.c @@ -5974,13 +5974,6 @@ scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { } #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX -MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX static pgno_t *static pgno_t * -scan4seq_avx(pgno_t *range, const size_t len, const unsigned seq) { - return nullptr; -} -#endif /* MDBX_ATTRIBUTE_TARGET_AVX */ - #if defined(__SSE2__) #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ #elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) @@ -5988,7 +5981,7 @@ scan4seq_avx(pgno_t *range, const size_t len, const unsigned seq) { #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) #define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2") -#endif +#endif /* __SSE2__ */ #ifdef MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned @@ -6058,14 +6051,12 @@ scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { } #endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ -#if defined(__AVX512BW__) +#if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512) #define scan4seq_default scan4seq_avx512bw #define scan4seq scan4seq_default -#elif defined(__AVX2__) +#elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) #define scan4seq_default scan4seq_avx2 -#elif defined(__AVX__) -#define scan4seq_default scan4seq_avx -#elif defined(__SSE2__) +#elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) #define scan4seq_default scan4seq_sse2 /* Choosing of another variants should be added here. 
*/ #endif /* scan4seq_default */ @@ -6100,22 +6091,18 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, __GNUC_PREREQ(4, 8) __builtin_cpu_init(); #endif /* __builtin_cpu_init() */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW - if (__builtin_cpu_supports("avx512bw")) - choice = scan4seq_avx512; -#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 + if (__builtin_cpu_supports("sse2")) + choice = scan4seq_sse2; +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ #ifdef MDBX_ATTRIBUTE_TARGET_AVX2 if (__builtin_cpu_supports("avx2")) choice = scan4seq_avx2; #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX - if (__builtin_cpu_supports("avx")) - choice = scan4seq_avx; -#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ -#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 - if (!choice && __builtin_cpu_supports("sse2")) - choice = scan4seq_sse2; -#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW + if (__builtin_cpu_supports("avx512bw")) + choice = scan4seq_avx512; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ /* Choosing of another variants should be added here. */ scan4seq = choice ? 
choice : scan4seq_default; return scan4seq(range, len, seq); From d6603a0c0af697070d598a1af3ba7f7534a85db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 7 Aug 2022 15:08:41 +0300 Subject: [PATCH 078/364] =?UTF-8?q?mdbx:=20add=20=C3=978=20accelerated=20`?= =?UTF-8?q?scan4seq()`=20(AVX2).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index 6d018165..ee8a6d13 100644 --- a/src/core.c +++ b/src/core.c @@ -5954,7 +5954,7 @@ MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, } #if !defined(MDBX_ATTRIBUTE_TARGET) && \ - (__has_attribute(__target__) || __GNUC_PREREQ(4, 8)) + (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) #endif /* MDBX_ATTRIBUTE_TARGET */ @@ -5966,10 +5966,77 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { } #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ +#if defined(__AVX2__) +#define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) +#define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2") +#endif /* __AVX2__ */ + #ifdef MDBX_ATTRIBUTE_TARGET_AVX2 -MDBX_MAYBE_UNUSED -__hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t *static pgno_t * + +MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned +diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m256i pattern) { + const __m256i f = _mm256_loadu_si256((const __m256i *)ptr); + const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset)); + const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern); + return _mm256_movemask_ps(*(const __m256 *)&cmp); +} + 
+MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t * scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m256i pattern = _mm256_set1_epi32(target); + uint8_t mask; + if (likely(len > seq + 7)) { + do { + mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) + goto found; + range -= 8; + } while (range > detent + 7); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 28 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifdef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0; +#else + const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; +#endif + if (likely(on_page_safe_mask & (uintptr_t)range)) { + const unsigned extra = (unsigned)(detent + 8 - range); + assert(extra > 0 && extra < 8); + mask = 0xFF << extra; + mask &= diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) { + found:; +#ifdef _MSC_VER + unsigned long index; + _BitScanReverse(&index, mask); +#else + const unsigned index = __builtin_clz(mask); +#endif /* _MSC_VER */ + range = range + 24 - index; + return range; + } + return nullptr; + } + do + if (*range - range[offset] == target) + return range; + while (--range != detent); return nullptr; } #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ From 3de759a7be9ac69dfdae6d4f62e6a19e837cf89c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 7 Aug 2022 22:24:00 +0300 Subject: [PATCH 079/364] mdbx: fix page-boundary checking inside accelerated `scan4seq()`. 
--- src/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index ee8a6d13..a17fa128 100644 --- a/src/core.c +++ b/src/core.c @@ -6015,7 +6015,7 @@ scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { #else const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; #endif - if (likely(on_page_safe_mask & (uintptr_t)range)) { + if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { const unsigned extra = (unsigned)(detent + 8 - range); assert(extra > 0 && extra < 8); mask = 0xFF << extra; @@ -6092,7 +6092,7 @@ scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { #else const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; #endif - if (likely(on_page_safe_mask & (uintptr_t)range)) { + if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { const unsigned extra = (unsigned)(detent + 4 - range); assert(extra > 0 && extra < 4); mask = 0xF << extra; From 98c53555ab98649d24e4f9a2d350bac8f5c37644 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 9 Aug 2022 00:24:44 +0300 Subject: [PATCH 080/364] mdbx: using e2k-frendly/cmov/branch-less bsearch. 
https://gitflic.ru/project/erthink/bsearch-try --- src/core.c | 93 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/src/core.c b/src/core.c index a17fa128..b54c91a1 100644 --- a/src/core.c +++ b/src/core.c @@ -2277,52 +2277,73 @@ static int lcklist_detach_locked(MDBX_env *env) { /*------------------------------------------------------------------------------ * LY: Binary search */ +#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__) +#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do \ + __asm __volatile("" \ + : "+r"(size) \ + : "r" /* the `b` constraint is more suitable here, but \ + cause CLANG to allocate and push/pop an one more \ + register, so using the `r` which avoids this. */ \ + (flag)); \ + while (0) +#else +#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do { \ + /* nope for non-clang or non-x86 */; \ + } while (0) +#endif /* Workaround for CLANG */ + +#define BINARY_SEARCH_STEP(TYPE_LIST, CMP, it, size, key) \ + do { \ + } while (0) + #define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ static __always_inline const TYPE_LIST *NAME( \ - const TYPE_LIST *first, unsigned length, const TYPE_ARG item) { \ - const TYPE_LIST *const begin = first, *const end = begin + length; \ + const TYPE_LIST *it, unsigned length, const TYPE_ARG item) { \ + const TYPE_LIST *const begin = it, *const end = begin + length; \ \ - while (length > 3) { \ - const unsigned whole = length; \ - length >>= 1; \ - const TYPE_LIST *const middle = first + length; \ - const unsigned left = whole - length - 1; \ - const bool cmp = expect_with_probability(CMP(*middle, item), 0, .5); \ - length = cmp ? left : length; \ - first = cmp ? 
middle + 1 : first; \ - } \ - \ - switch (length) { \ - case 3: \ - if (expect_with_probability(!CMP(*first, item), 0, .5)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 2: \ - if (expect_with_probability(!CMP(*first, item), 0, .5)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 1: \ - if (expect_with_probability(!CMP(*first, item), 0, .5)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 0: \ - break; \ - default: \ - __unreachable(); \ - } \ + if (MDBX_HAVE_CMOV) \ + do { \ + /* Адаптивно-упрощенный шаг двоичного поиска: \ + * - без переходов при наличии cmov или аналога; \ + * - допускает лишние итерации; \ + * - но ищет пока size > 2, что требует дозавершения поиска \ + * среди остающихся 0-1-2 элементов. */ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \ + it = flag ? middle : it; \ + } while (length > 2); \ + else \ + while (length > 2) { \ + /* Вариант с использованием условного перехода. Основное отличие в \ + * том, что при "не равно" (true от компаратора) переход делается на 1 \ + * ближе к концу массива. Алгоритмически это верно и обеспечивает \ + * чуть-чуть более быструю сходимость, но зато требует больше \ + * вычислений при true от компаратора. Также ВАЖНО(!) не допускается \ + * спекулятивное выполнение при size == 0. 
*/ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + if (flag) { \ + it = middle + 1; \ + length -= 1; \ + } \ + } \ + it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \ + it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \ \ if (mdbx_audit_enabled()) { \ - for (const TYPE_LIST *scan = begin; scan < first; ++scan) \ + for (const TYPE_LIST *scan = begin; scan < it; ++scan) \ assert(CMP(*scan, item)); \ - for (const TYPE_LIST *scan = first; scan < end; ++scan) \ + for (const TYPE_LIST *scan = it; scan < end; ++scan) \ assert(!CMP(*scan, item)); \ (void)begin, (void)end; \ } \ \ - return first; \ + return it; \ } /*----------------------------------------------------------------------------*/ From 2ff8d3c4f24176d9522ae54503df15742e87385c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 9 Aug 2022 18:27:43 +0300 Subject: [PATCH 081/364] mdbx: native `wchar_t` pathname for Windows. 
--- mdbx.h | 35 +++++++++ mdbx.h++ | 8 +++ src/core.c | 174 ++++++++++++++++++++++++++++++++++---------- src/internals.h | 2 +- src/mdbx.c++ | 188 ++++++++++++++++-------------------------------- src/osal.c | 39 +++------- src/osal.h | 16 +++-- 7 files changed, 264 insertions(+), 198 deletions(-) diff --git a/mdbx.h b/mdbx.h index 3d13751b..82d46ef4 100644 --- a/mdbx.h +++ b/mdbx.h @@ -827,18 +827,30 @@ enum MDBX_constants { #ifndef MDBX_LOCKNAME /** \brief The name of the lock file in the environment * without using \ref MDBX_NOSUBDIR */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCKNAME "/mdbx.lck" +#else +#define MDBX_LOCKNAME L"\\mdbx.lck" #endif +#endif /* MDBX_LOCKNAME */ #ifndef MDBX_DATANAME /** \brief The name of the data file in the environment * without using \ref MDBX_NOSUBDIR */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_DATANAME "/mdbx.dat" +#else +#define MDBX_DATANAME L"\\mdbx.dat" #endif +#endif /* MDBX_DATANAME */ #ifndef MDBX_LOCK_SUFFIX /** \brief The suffix of the lock file when \ref MDBX_NOSUBDIR is used */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCK_SUFFIX "-lck" +#else +#define MDBX_LOCK_SUFFIX L"-lck" #endif +#endif /* MDBX_LOCK_SUFFIX */ /* DEBUG & LOGGING ************************************************************/ @@ -2275,6 +2287,11 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathnameW, + MDBX_env_flags_t flags, mdbx_mode_t mode); +#endif /* Windows */ + /** \brief Deletion modes for \ref mdbx_env_delete(). * \ingroup c_extra * \see mdbx_env_delete() */ @@ -2317,6 +2334,10 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; * so no deletion was performed. 
*/ LIBMDBX_API int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, + MDBX_env_delete_mode_t mode); +#endif /* Windows */ /** \brief Copy an MDBX environment to the specified path, with options. * \ingroup c_extra @@ -2351,6 +2372,10 @@ LIBMDBX_API int mdbx_env_delete(const char *pathname, * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest, MDBX_copy_flags_t flags); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest, + MDBX_copy_flags_t flags); +#endif /* Windows */ /** \brief Copy an environment to the specified file descriptor, with * options. @@ -2803,7 +2828,11 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. */ +#if !(defined(_WIN32) || defined(_WIN64)) LIBMDBX_API int mdbx_env_get_path(const MDBX_env *env, const char **dest); +#else +LIBMDBX_API int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **dest); +#endif /* Windows */ /** \brief Return the file descriptor for the given environment. * \ingroup c_statinfo @@ -5195,6 +5224,12 @@ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, LIBMDBX_API int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable); +#if defined(_WIN32) || defined(_WIN64) +LIBMDBX_API int mdbx_env_open_for_recoveryW(MDBX_env *env, + const wchar_t *pathnameW, + unsigned target_meta, + bool writeable); +#endif /* Windows */ /** \brief Turn database to the specified meta-page. 
* diff --git a/mdbx.h++ b/mdbx.h++ index 64143c48..623b4cc2 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -3225,6 +3225,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) env ©(const ::std::wstring &destination, bool compactify, bool force_dynamic_size = false); + env ©(const wchar_t *destination, bool compactify, + bool force_dynamic_size = false); #endif /* Windows */ env ©(const ::std::string &destination, bool compactify, bool force_dynamic_size = false); @@ -3260,6 +3262,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) static bool remove(const ::std::wstring &pathname, const remove_mode mode = just_remove); + static bool remove(const wchar_t *pathname, + const remove_mode mode = just_remove); #endif /* Windows */ static bool remove(const ::std::string &pathname, const remove_mode mode = just_remove); @@ -3507,6 +3511,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) env_managed(const ::std::wstring &pathname, const operate_parameters &, bool accede = true); + explicit env_managed(const wchar_t *pathname, const operate_parameters &, + bool accede = true); #endif /* Windows */ env_managed(const ::std::string &pathname, const operate_parameters &, bool accede = true); @@ -3531,6 +3537,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) env_managed(const ::std::wstring &pathname, const create_parameters &, const operate_parameters &, bool accede = true); + explicit env_managed(const wchar_t *pathname, const create_parameters &, + const operate_parameters &, bool accede = true); #endif /* Windows */ env_managed(const ::std::string &pathname, const create_parameters &, const operate_parameters &, bool accede = true); diff --git a/src/core.c b/src/core.c index b54c91a1..62914508 100644 --- a/src/core.c +++ b/src/core.c @@ -12433,7 +12433,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, /******************************************************************************/ 
/* Open and/or initialize the lock region for the environment. */ -__cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, +__cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, mdbx_mode_t mode) { mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); @@ -12816,6 +12816,21 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + + return mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); +} + +__cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, + unsigned target_meta, bool writeable) { +#endif /* Windows */ + if (unlikely(target_meta >= NUM_METAS)) return MDBX_EINVAL; int rc = check_env(env, false); @@ -12825,35 +12840,49 @@ __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, return MDBX_EPERM; env->me_stuck_meta = (int8_t)target_meta; - return mdbx_env_open( - env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, - 0); + return +#if defined(_WIN32) || defined(_WIN64) + mdbx_env_openW +#else + mdbx_env_open +#endif /* Windows */ + (env, pathname, writeable ? 
MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, + 0); } typedef struct { void *buffer_for_free; - char *lck, *dxb; + pathchar_t *lck, *dxb; size_t ent_len; } MDBX_handle_env_pathname; +static bool path_equal(const pathchar_t *l, const pathchar_t *r, size_t len) { +#if defined(_WIN32) || defined(_WIN64) + while (len > 0) { + pathchar_t a = *l++; + pathchar_t b = *r++; + a = (a == '\\') ? '/' : a; + b = (b == '\\') ? '/' : b; + if (a != b) + return false; + } + return true; +#else + return memcmp(l, r, len * sizeof(pathchar_t)) == 0; +#endif +} + __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, - const char *pathname, + const pathchar_t *pathname, MDBX_env_flags_t *flags, const mdbx_mode_t mode) { - int rc; memset(ctx, 0, sizeof(*ctx)); - if (unlikely(!pathname)) + if (unlikely(!pathname || !*pathname)) return MDBX_EINVAL; + int rc; #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - - const DWORD dwAttrib = GetFileAttributesW(pathnameW); + const DWORD dwAttrib = GetFileAttributesW(pathname); if (dwAttrib == INVALID_FILE_ATTRIBUTES) { rc = GetLastError(); if (rc != MDBX_ENOFILE) @@ -12863,8 +12892,7 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, return rc; /* auto-create directory if requested */ - if ((*flags & MDBX_NOSUBDIR) == 0 && - !CreateDirectoryW(pathnameW, nullptr)) { + if ((*flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathname, nullptr)) { rc = GetLastError(); if (rc != ERROR_ALREADY_EXISTS) return rc; @@ -12905,41 +12933,66 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, } #endif - static const char dxb_name[] = MDBX_DATANAME; - static const size_t dxb_name_len = sizeof(dxb_name) - 1; - static 
const char lck_name[] = MDBX_LOCKNAME; - static const char lock_suffix[] = MDBX_LOCK_SUFFIX; + static const pathchar_t dxb_name[] = MDBX_DATANAME; + static const pathchar_t lck_name[] = MDBX_LOCKNAME; + static const pathchar_t lock_suffix[] = MDBX_LOCK_SUFFIX; - ctx->ent_len = strlen(pathname); - if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len >= dxb_name_len && - !memcmp(dxb_name, pathname + ctx->ent_len - dxb_name_len, dxb_name_len)) { +#if defined(_WIN32) || defined(_WIN64) + assert(dxb_name[0] == '\\' && lck_name[0] == '\\'); + const size_t pathname_len = wcslen(pathname); +#else + assert(dxb_name[0] == '/' && lck_name[0] == '/'); + const size_t pathname_len = strlen(pathname); +#endif + assert(lock_suffix[0] != '\\' && lock_suffix[0] != '/'); + ctx->ent_len = pathname_len; + static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; + if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len > dxb_name_len && + path_equal(pathname + ctx->ent_len - dxb_name_len, dxb_name, + dxb_name_len)) { *flags -= MDBX_NOSUBDIR; ctx->ent_len -= dxb_name_len; } const size_t bytes_needed = - ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR) - ? sizeof(lock_suffix) + 1 - : sizeof(lck_name) + sizeof(dxb_name)); + sizeof(pathchar_t) * ctx->ent_len * 2 + + ((*flags & MDBX_NOSUBDIR) ? 
sizeof(lock_suffix) + sizeof(pathchar_t) + : sizeof(lck_name) + sizeof(dxb_name)); ctx->buffer_for_free = mdbx_malloc(bytes_needed); if (!ctx->buffer_for_free) return MDBX_ENOMEM; - ctx->lck = ctx->buffer_for_free; + ctx->dxb = ctx->buffer_for_free; + ctx->lck = ctx->dxb + ctx->ent_len + 1; + memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); if (*flags & MDBX_NOSUBDIR) { - ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lock_suffix); - sprintf(ctx->lck, "%s%s", pathname, lock_suffix); - strcpy(ctx->dxb, pathname); + memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); } else { - ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lck_name); - sprintf(ctx->lck, "%.*s%s", (int)ctx->ent_len, pathname, lck_name); - sprintf(ctx->dxb, "%.*s%s", (int)ctx->ent_len, pathname, dxb_name); + ctx->lck += dxb_name_len; + memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); + memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); } + memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); return MDBX_SUCCESS; } __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + + return mdbx_env_deleteW(pathnameW, mode); +} + +__cold int mdbx_env_deleteW(const wchar_t *pathname, + MDBX_env_delete_mode_t mode) { +#endif /* Windows */ + switch (mode) { default: return MDBX_EINVAL; @@ -12959,7 +13012,7 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; dummy_env->me_os_psize = (unsigned)mdbx_syspagesize(); dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); - dummy_env->me_pathname = (char *)pathname; + dummy_env->me_pathname = (pathchar_t *)pathname; MDBX_handle_env_pathname env_pathname; STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); @@ -13021,6 +13074,21 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + + return mdbx_env_openW(env, pathnameW, flags, mode); +} + +__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#endif /* Windows */ + int rc = check_env(env, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -13066,7 +13134,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, goto bailout; env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); + env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t)); env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); @@ -13075,7 +13143,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, rc = MDBX_ENOMEM; goto bailout; } - memcpy(env->me_pathname, env_pathname.dxb, env_pathname.ent_len); + memcpy(env->me_pathname, env_pathname.dxb, + env_pathname.ent_len * sizeof(pathchar_t)); env->me_dbxs[FREE_DBI].md_cmp 
= cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; @@ -19911,6 +19980,21 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, dest_path, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const dest_pathW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(dest_pathW, dest_path, wlen + 1)) + return ERROR_INVALID_NAME; + + return mdbx_env_copyW(env, dest_pathW, flags); +} + +LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, + MDBX_copy_flags_t flags) { +#endif /* Windows */ + int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20049,6 +20133,7 @@ __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { #endif } +#if !(defined(_WIN32) || defined(_WIN64)) __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) @@ -20060,6 +20145,19 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { *arg = env->me_pathname; return MDBX_SUCCESS; } +#else +__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return MDBX_EINVAL; + + *arg = env->me_pathname; + return MDBX_SUCCESS; +} +#endif /* Windows */ __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { int rc = check_env(env, true); diff --git a/src/internals.h b/src/internals.h index 4d830abf..af82934e 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1143,7 +1143,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ 
- char *me_pathname; /* path to the DB files */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ diff --git a/src/mdbx.c++ b/src/mdbx.c++ index ccb5fa3e..a3587ff7 100644 --- a/src/mdbx.c++ +++ b/src/mdbx.c++ @@ -201,64 +201,6 @@ __cold bug::~bug() noexcept {} #endif /* Unused*/ -//------------------------------------------------------------------------------ - -template struct path_to_pchar { - const std::string str; - path_to_pchar(const PATH &path) : str(path.generic_string()) {} - operator const char *() const { return str.c_str(); } -}; - -template -MDBX_MAYBE_UNUSED PATH pchar_to_path(const char *c_str) { - return PATH(c_str); -} - -#if defined(_WIN32) || defined(_WIN64) - -#ifndef WC_ERR_INVALID_CHARS -static const DWORD WC_ERR_INVALID_CHARS = - (6 /* Windows Vista */ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion()))) - ? 0x00000080 - : 0; -#endif /* WC_ERR_INVALID_CHARS */ - -template <> struct path_to_pchar { - std::string str; - path_to_pchar(const std::wstring &path) { - if (!path.empty()) { - const int chars = - WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, path.data(), - int(path.size()), nullptr, 0, nullptr, nullptr); - if (chars == 0) - mdbx::error::throw_exception(GetLastError()); - str.append(chars, '\0'); - WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, path.data(), - int(path.size()), const_cast(str.data()), - chars, nullptr, nullptr); - } - } - operator const char *() const { return str.c_str(); } -}; - -template <> -MDBX_MAYBE_UNUSED std::wstring pchar_to_path(const char *c_str) { - std::wstring wstr; - if (c_str && *c_str) { - const int chars = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, c_str, - int(strlen(c_str)), nullptr, 0); - if (chars == 0) - mdbx::error::throw_exception(GetLastError()); - wstr.append(chars, '\0'); - MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, c_str, - int(strlen(c_str)), 
const_cast(wstr.data()), - chars); - } - return wstr; -} - -#endif /* Windows */ - } // namespace //------------------------------------------------------------------------------ @@ -1239,29 +1181,20 @@ bool env::is_pristine() const { bool env::is_empty() const { return get_stat().ms_leaf_pages == 0; } -#ifdef MDBX_STD_FILESYSTEM_PATH -env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, bool compactify, +#if defined(_WIN32) || defined(_WIN64) +env &env::copy(const wchar_t *destination, bool compactify, bool force_dynamic_size) { - const path_to_pchar utf8(destination); error::success_or_throw( - ::mdbx_env_copy(handle_, utf8, - (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | - (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE - : MDBX_CP_DEFAULTS))); + ::mdbx_env_copyW(handle_, destination, + (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | + (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE + : MDBX_CP_DEFAULTS))); return *this; } -#endif /* MDBX_STD_FILESYSTEM_PATH */ -#if defined(_WIN32) || defined(_WIN64) env &env::copy(const ::std::wstring &destination, bool compactify, bool force_dynamic_size) { - const path_to_pchar<::std::wstring> utf8(destination); - error::success_or_throw( - ::mdbx_env_copy(handle_, utf8, - (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | - (force_dynamic_size ? 
MDBX_CP_FORCE_DYNAMIC_SIZE - : MDBX_CP_DEFAULTS))); - return *this; + return copy(destination.c_str(), compactify, force_dynamic_size); } #endif /* Windows */ @@ -1289,26 +1222,33 @@ env &env::copy(filehandle fd, bool compactify, bool force_dynamic_size) { return *this; } -path env::get_path() const { - const char *c_str; - error::success_or_throw(::mdbx_env_get_path(handle_, &c_str)); - return pchar_to_path(c_str); -} - #ifdef MDBX_STD_FILESYSTEM_PATH -bool env::remove(const MDBX_STD_FILESYSTEM_PATH &pathname, - const remove_mode mode) { - const path_to_pchar utf8(pathname); - return error::boolean_or_throw( - ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); +env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, bool compactify, + bool force_dynamic_size) { + return copy(destination.native(), compactify, force_dynamic_size); } #endif /* MDBX_STD_FILESYSTEM_PATH */ +path env::get_path() const { #if defined(_WIN32) || defined(_WIN64) -bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { - const path_to_pchar<::std::wstring> utf8(pathname); + const wchar_t *c_wstr; + error::success_or_throw(::mdbx_env_get_pathW(handle_, &c_wstr)); + return path(c_wstr); +#else + const char *c_str; + error::success_or_throw(::mdbx_env_get_path(handle_, &c_str)); + return path(c_str); +#endif +} + +#if defined(_WIN32) || defined(_WIN64) +bool env::remove(const wchar_t *pathname, const remove_mode mode) { return error::boolean_or_throw( - ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); + ::mdbx_env_deleteW(pathname, MDBX_env_delete_mode_t(mode))); +} + +bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { + return remove(pathname.c_str(), mode); } #endif /* Windows */ @@ -1321,6 +1261,13 @@ bool env::remove(const ::std::string &pathname, const remove_mode mode) { return remove(pathname.c_str(), mode); } +#ifdef MDBX_STD_FILESYSTEM_PATH +bool env::remove(const MDBX_STD_FILESYSTEM_PATH &pathname, + const remove_mode mode) { 
+ return remove(pathname.native(), mode); +} +#endif /* MDBX_STD_FILESYSTEM_PATH */ + //------------------------------------------------------------------------------ static inline MDBX_env *create_env() { @@ -1357,66 +1304,42 @@ __cold void env_managed::setup(unsigned max_maps, unsigned max_readers) { error::success_or_throw(::mdbx_env_set_maxdbs(handle_, max_maps)); } -#ifdef MDBX_STD_FILESYSTEM_PATH -__cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, - const operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar utf8(pathname); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} - -__cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, - const env_managed::create_parameters &cp, - const env::operate_parameters &op, bool accede) - : env_managed(create_env()) { - setup(op.max_maps, op.max_readers); - const path_to_pchar utf8(pathname); - set_geometry(cp.geometry); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), - cp.file_mode_bits)); - - if (op.options.nested_write_transactions && - !get_options().nested_write_transactions) - MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); -} -#endif /* MDBX_STD_FILESYSTEM_PATH */ - #if defined(_WIN32) || defined(_WIN64) -__cold env_managed::env_managed(const ::std::wstring &pathname, +__cold env_managed::env_managed(const wchar_t *pathname, const operate_parameters &op, bool accede) : env_managed(create_env()) { setup(op.max_maps, op.max_readers); - const path_to_pchar<::std::wstring> utf8(pathname); error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); + ::mdbx_env_openW(handle_, pathname, op.make_flags(accede), 0)); if 
(op.options.nested_write_transactions && !get_options().nested_write_transactions) MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); } -__cold env_managed::env_managed(const ::std::wstring &pathname, +__cold env_managed::env_managed(const wchar_t *pathname, const env_managed::create_parameters &cp, const env::operate_parameters &op, bool accede) : env_managed(create_env()) { setup(op.max_maps, op.max_readers); - const path_to_pchar<::std::wstring> utf8(pathname); set_geometry(cp.geometry); - error::success_or_throw( - ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), - cp.file_mode_bits)); + error::success_or_throw(::mdbx_env_openW( + handle_, pathname, op.make_flags(accede, cp.use_subdirectory), + cp.file_mode_bits)); if (op.options.nested_write_transactions && !get_options().nested_write_transactions) MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); } + +__cold env_managed::env_managed(const ::std::wstring &pathname, + const operate_parameters &op, bool accede) + : env_managed(pathname.c_str(), op, accede) {} + +__cold env_managed::env_managed(const ::std::wstring &pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(pathname.c_str(), cp, op, accede) {} #endif /* Windows */ __cold env_managed::env_managed(const char *pathname, @@ -1455,6 +1378,17 @@ __cold env_managed::env_managed(const ::std::string &pathname, const env::operate_parameters &op, bool accede) : env_managed(pathname.c_str(), cp, op, accede) {} +#ifdef MDBX_STD_FILESYSTEM_PATH +__cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, + const operate_parameters &op, bool accede) + : env_managed(pathname.native(), op, accede) {} + +__cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(pathname.native(), cp, op, accede) {} +#endif /* 
MDBX_STD_FILESYSTEM_PATH */ + //------------------------------------------------------------------------------ txn_managed txn::start_nested() { diff --git a/src/osal.c b/src/osal.c index f2ad4ab8..8d44eb65 100644 --- a/src/osal.c +++ b/src/osal.c @@ -518,15 +518,9 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { +MDBX_INTERNAL_FUNC int mdbx_removefile(const pathchar_t *pathname) { #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - return DeleteFileW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError(); + return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); #else return unlink(pathname) ? errno : MDBX_SUCCESS; #endif @@ -536,34 +530,22 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } #endif /*! Windows */ -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) { +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const pathchar_t *pathname) { #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - return RemoveDirectoryW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError(); + return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); #else return rmdir(pathname) ? 
errno : MDBX_SUCCESS; #endif } MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits) { *fd = INVALID_HANDLE_VALUE; #if defined(_WIN32) || defined(_WIN64) - const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); - if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) - return ERROR_INVALID_NAME; - wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); - if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) - return ERROR_INVALID_NAME; - DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; DWORD FlagsAndAttributes = FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; @@ -608,12 +590,12 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, break; } - *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, + *fd = CreateFileW(pathname, DesiredAccess, ShareMode, NULL, CreationDisposition, FlagsAndAttributes, NULL); if (*fd == INVALID_HANDLE_VALUE) { int err = (int)GetLastError(); if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) { - if (GetFileAttributesW(pathnameW) == INVALID_FILE_ATTRIBUTES && + if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES && GetLastError() == ERROR_FILE_NOT_FOUND) err = ERROR_FILE_NOT_FOUND; } @@ -632,7 +614,7 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); if (AttributesDiff) - (void)SetFileAttributesW(pathnameW, info.dwFileAttributes ^ AttributesDiff); + (void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff); #else int flags = unix_mode_bits ? 
O_CREAT : 0; @@ -1089,7 +1071,8 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, } MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err) { + const pathchar_t *pathname, + int err) { #if defined(_WIN32) || defined(_WIN64) (void)pathname; (void)err; diff --git a/src/osal.h b/src/osal.h index 049d99dc..7c5ab9bb 100644 --- a/src/osal.h +++ b/src/osal.h @@ -224,6 +224,12 @@ mdbx_syspagesize(void) { #endif } +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#else +typedef char pathchar_t; +#endif + typedef struct mdbx_mmap_param { union { void *address; @@ -365,12 +371,13 @@ enum mdbx_openfile_purpose { }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); @@ -398,7 +405,8 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, enum mdbx_syncmode_bits mode_bits); MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + const pathchar_t *pathname, + int err); MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); From c2bf9ebf17785145595612c453bc0d91183a81ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: 
Tue, 9 Aug 2022 23:40:53 +0300 Subject: [PATCH 082/364] mdbx: minor refine AVX2/SSE2-accelerated `scan4seq()`. --- src/core.c | 202 ++++++++++++++++++++++++++--------------------------- 1 file changed, 99 insertions(+), 103 deletions(-) diff --git a/src/core.c b/src/core.c index 69919870..ba203fdd 100644 --- a/src/core.c +++ b/src/core.c @@ -5969,18 +5969,28 @@ MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, return nullptr; } +#if defined(_MSC_VER) && !defined(__builtin_clz) && \ + !__has_builtin(__builtin_clz) +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(unsigned value) { + unsigned long index; + _BitScanReverse(&index, value); + return index; +} +#endif /* _MSC_VER */ + #if !defined(MDBX_ATTRIBUTE_TARGET) && \ (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) #endif /* MDBX_ATTRIBUTE_TARGET */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW -MDBX_MAYBE_UNUSED -__hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t *static pgno_t * -scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { - return nullptr; -} -#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ +#if defined(__SSE2__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) +#define __SSE2__ +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2") +#endif /* __SSE2__ */ #if defined(__AVX2__) #define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */ @@ -5988,8 +5998,66 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { #define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2") #endif /* __AVX2__ */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 +MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned +diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t 
offset, + const __m128i pattern) { + const __m128i f = _mm_loadu_si128((const __m128i *)ptr); + const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); + const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); + return _mm_movemask_ps(*(const __m128 *)&cmp); +} +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * +scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m128i pattern = _mm_set1_epi32(target); + uint8_t mask; + if (likely(len > seq + 3)) { + do { + mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) { + found: + return range + 28 - __builtin_clz(mask); + } + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = 0xF << extra; + mask &= diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, const __m256i pattern) { @@ -6014,8 +6082,10 @@ scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { if (likely(len > seq + 7)) { do { mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); - if (mask) - goto found; + if (mask) { + found: + return range + 24 - __builtin_clz(mask); + } range -= 8; } while (range > detent + 7); if (range == detent) @@ -6026,115 +6096,41 @@ scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifdef __SANITIZE_ADDRESS__ - const unsigned on_page_safe_mask = 0; -#else +#ifndef __SANITIZE_ADDRESS__ const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; -#endif if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { const unsigned extra = (unsigned)(detent + 8 - range); assert(extra > 0 && extra < 8); mask = 0xFF << extra; mask &= diffcmp2mask_avx2(range - 7, offset, pattern); - if (mask) { - found:; -#ifdef _MSC_VER - unsigned long index; - _BitScanReverse(&index, mask); -#else - const unsigned index = __builtin_clz(mask); -#endif /* _MSC_VER */ - range = range + 24 - index; - return range; - } + if (mask) + goto found; return nullptr; } - do +#endif /* __SANITIZE_ADDRESS__ */ + if (range - 3 > detent) { + mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { if (*range - range[offset] == target) return range; - while (--range != detent); + --range; + } return nullptr; } #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ -#if defined(__SSE2__) -#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ -#elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) -#define __SSE2__ -#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ -#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) -#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2") -#endif /* __SSE2__ */ - -#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 -MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned -diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, - const __m128i pattern) { - const __m128i f = _mm_loadu_si128((const __m128i *)ptr); - const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); - const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); - return _mm_movemask_ps(*(const __m128 *)&cmp); -} - -MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * -scan4seq_sse2(pgno_t *range, const size_t len, 
const unsigned seq) { - assert(seq > 0 && len > seq); -#if MDBX_PNL_ASCENDING -#error "FIXME: Not implemented" -#endif /* MDBX_PNL_ASCENDING */ - assert(range[-(ptrdiff_t)len] == len); - pgno_t *const detent = range - len + seq; - const ptrdiff_t offset = -(ptrdiff_t)seq; - const pgno_t target = (pgno_t)offset; - const __m128i pattern = _mm_set_epi32(target, target, target, target); - uint8_t mask; - if (likely(len > seq + 3)) { - do { - mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); - if (mask) - goto found; - range -= 4; - } while (range > detent + 3); - if (range == detent) - return nullptr; - } - - /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не - * только за пределами региона выделенного под PNL, но и пересекать границу - * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. - * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ -#ifdef __SANITIZE_ADDRESS__ - const unsigned on_page_safe_mask = 0; -#else - const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; -#endif - if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { - const unsigned extra = (unsigned)(detent + 4 - range); - assert(extra > 0 && extra < 4); - mask = 0xF << extra; - mask &= diffcmp2mask_sse2(range - 3, offset, pattern); - if (mask) { - found:; -#ifdef _MSC_VER - unsigned long index; - _BitScanReverse(&index, mask); -#else - const unsigned index = __builtin_clz(mask); -#endif /* _MSC_VER */ - range = range + 28 - index; - return range; - } - return nullptr; - } - do - if (*range - range[offset] == target) - return range; - while (--range != detent); +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t * +scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { return nullptr; } -#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ -#if defined(__AVX512BW__) && 
defined(MDBX_ATTRIBUTE_TARGET_AVX512) +#if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) #define scan4seq_default scan4seq_avx512bw #define scan4seq scan4seq_default #elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) @@ -6184,7 +6180,7 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ #ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW if (__builtin_cpu_supports("avx512bw")) - choice = scan4seq_avx512; + choice = scan4seq_avx512bw; #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ /* Choosing of another variants should be added here. */ scan4seq = choice ? choice : scan4seq_default; From 78dc69970925c1f52c2a80e6c7db8f1da1e93312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 10 Aug 2022 00:33:08 +0300 Subject: [PATCH 083/364] =?UTF-8?q?mdbx:=20add=20=C3=9716=20accelerated=20?= =?UTF-8?q?`scan4seq()`=20(AVX512BW).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/core.c b/src/core.c index ba203fdd..e726ebc6 100644 --- a/src/core.c +++ b/src/core.c @@ -5998,6 +5998,13 @@ MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(unsigned value) { #define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2") #endif /* __AVX2__ */ +#if defined(__AVX512BW__) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0)) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW MDBX_ATTRIBUTE_TARGET("avx512bw") +#endif /* __AVX512BW__ */ + #ifdef MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, @@ -6124,8 +6131,72 @@ scan4seq_avx2(pgno_t 
*range, const size_t len, const unsigned seq) { #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ #ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW +MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned +diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset, + const __m512i pattern) { + const __m512i f = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset)); + return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern); +} + MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t * scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m512i pattern = _mm512_set1_epi32(target); + unsigned mask; + if (likely(len > seq + 15)) { + do { + mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) { + found: + return range + 16 - __builtin_clz(mask); + } + range -= 16; + } while (range > detent + 15); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 60 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { + const unsigned extra = (unsigned)(detent + 16 - range); + assert(extra > 0 && extra < 16); + mask = 0xFFFF << extra; + mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + if (range - 7 > detent) { + mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern); + if (mask) + return range + 24 - __builtin_clz(mask); + range -= 8; + } + if (range - 3 > detent) { + mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { + if (*range - range[offset] == target) + return range; + --range; + } return nullptr; } #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ From d8f0c9dc441909a5fb33001ffedef6e921e574a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 7 Aug 2022 11:44:17 +0300 Subject: [PATCH 084/364] mdbx: more `__hot`. --- src/core.c | 58 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/core.c b/src/core.c index e726ebc6..14c4be12 100644 --- a/src/core.c +++ b/src/core.c @@ -786,7 +786,7 @@ get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) { * in an obsolete versions of Elbrus's libc and kernels. 
*/ #if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ MDBX_E2K_MLHCPB_WORKAROUND -int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, +__hot int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, size_t n) { if (unlikely(n > 42 /* LY: align followed access if reasonable possible */ @@ -853,7 +853,7 @@ int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, return (n & 1) ? *(uint8_t *)s1 - *(uint8_t *)s2 : 0; } -int __hot mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { +__hot int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { while (true) { int diff = *(uint8_t *)s1 - *(uint8_t *)s2; if (likely(diff != 0) || *s1 == '\0') @@ -863,7 +863,7 @@ int __hot mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { } } -int __hot mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, +__hot int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, size_t n) { while (n > 0) { int diff = *(uint8_t *)s1 - *(uint8_t *)s2; @@ -876,7 +876,7 @@ int __hot mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, return 0; } -size_t __hot mdbx_e2k_strlen_bug_workaround(const char *s) { +__hot size_t mdbx_e2k_strlen_bug_workaround(const char *s) { size_t n = 0; while (*s) { s += 1; @@ -885,7 +885,7 @@ size_t __hot mdbx_e2k_strlen_bug_workaround(const char *s) { return n; } -size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { +__hot size_t mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { size_t n = 0; while (maxlen > n && *s) { s += 1; @@ -2142,7 +2142,7 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *lo, *hi; \ } NAME##_stack; \ \ - static __hot void NAME(TYPE *const __restrict begin, \ + __hot static void NAME(TYPE *const __restrict begin, \ TYPE *const __restrict end) { \ NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *__restrict top = stack; \ \ @@ -2492,7 +2492,7 @@ mdbx_pnl_append_range(bool 
spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { } /* Append an pgno range into the sorted PNL */ -static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, +__hot static int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, pgno_t pgno, unsigned n) { assert(n > 0); @@ -2575,7 +2575,7 @@ pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, } /* Merge a PNL onto a PNL. The destination PNL must be big enough */ -static void __hot pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { +__hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); assert(pnl_check(src, MAX_PAGENO + 1)); const pgno_t src_len = MDBX_PNL_SIZE(src); @@ -2649,7 +2649,7 @@ RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) -static __hot void mdbx_pnl_sort_nochk(MDBX_PNL pnl) { +__hot __noinline static void mdbx_pnl_sort_nochk(MDBX_PNL pnl) { if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl)))) pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); @@ -2665,7 +2665,8 @@ static __inline void mdbx_pnl_sort(MDBX_PNL pnl, size_t limit4check) { * Returns The index of the first item greater than or equal to pgno. 
*/ SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) -static __hot unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, pgno_t pgno) { +__hot __noinline static unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, + pgno_t pgno) { const pgno_t *begin = MDBX_PNL_BEGIN(pnl); const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno); const pgno_t *end = begin + MDBX_PNL_SIZE(pnl); @@ -3001,7 +3002,7 @@ static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { #define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) -__hot static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { +__hot __noinline static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); if (mdbx_audit_enabled()) { @@ -13564,7 +13565,7 @@ __cold int mdbx_env_close(MDBX_env *env) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Compare two items pointing at aligned unsigned int's. */ -static int __hot cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: @@ -13581,7 +13582,7 @@ static int __hot cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { } /* Compare two items pointing at 2-byte aligned unsigned int's. */ -static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: @@ -13600,7 +13601,7 @@ static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { /* Compare two items pointing at unsigned values with unknown alignment. * * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. 
*/ -static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: @@ -13617,7 +13618,7 @@ static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { } /* Compare two items lexically */ -static int __hot cmp_lexical(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { if (a->iov_len == b->iov_len) return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; @@ -13628,7 +13629,7 @@ static int __hot cmp_lexical(const MDBX_val *a, const MDBX_val *b) { } /* Compare two items in reverse byte order */ -static int __hot cmp_reverse(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; if (likely(shortest)) { const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len; @@ -13644,7 +13645,7 @@ static int __hot cmp_reverse(const MDBX_val *a, const MDBX_val *b) { } /* Fast non-lexically comparator */ -static int __hot cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { +__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { int diff = CMP2INT(a->iov_len, b->iov_len); return likely(diff || a->iov_len == 0) ? diff @@ -13663,7 +13664,7 @@ static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, * Returns the smallest entry larger or equal to the key. * Updates the cursor index with the index of the found entry. * If no entry larger or equal to the key is found, returns NULL. 
*/ -static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, +__hot static struct node_result mdbx_node_search(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; const int nkeys = page_numkeys(mp); @@ -13922,8 +13923,8 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, /* Finish mdbx_page_search() / mdbx_page_search_lowest(). * The cursor is at the root page, set up the rest of it. */ -__hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, - int flags) { +__hot __noinline static int +mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; int rc; DKBUF_DEBUG; @@ -14592,9 +14593,10 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } /* Set the cursor on a specific data item. */ -static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op) { +__hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, + MDBX_val *key, + MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node = NULL; DKBUF_DEBUG; @@ -14976,8 +14978,8 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { return MDBX_SUCCESS; } -int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +__hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -15372,8 +15374,8 @@ static int mdbx_cursor_touch(MDBX_cursor *mc) { return rc; } -int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, - unsigned flags) { +__hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, + unsigned flags) { MDBX_env *env; MDBX_page *sub_root = NULL; MDBX_val xdata, *rdata, dkey, olddata; @@ -16174,7 +16176,7 @@ new_sub:; return rc; } -int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { +__hot 
int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(!mc)) return MDBX_EINVAL; From 096d6a9bd600ad4bc6dee02df3c74b4ad7835350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 7 Aug 2022 12:10:36 +0300 Subject: [PATCH 085/364] mdbx: some micro-optimizations. --- src/core.c | 42 ++++++++++++++++++++---------------------- src/osal.c | 10 ++++++---- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/core.c b/src/core.c index 14c4be12..25437165 100644 --- a/src/core.c +++ b/src/core.c @@ -1439,7 +1439,7 @@ __cold void mdbx_rthc_global_init(void) { __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); #endif /* checking time conversion, this also avoids racing on 32-bit architectures - * during writing calculated 64-bit ratio(s) into memory. */ + * during storing calculated 64-bit ratio(s) into memory. */ uint32_t proba = UINT32_MAX; while (true) { unsigned time_conversion_checkup = @@ -13647,7 +13647,7 @@ __hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { /* Fast non-lexically comparator */ __hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { int diff = CMP2INT(a->iov_len, b->iov_len); - return likely(diff || a->iov_len == 0) + return likely(diff) || a->iov_len == 0 ? 
diff : memcmp(a->iov_base, b->iov_base, a->iov_len); } @@ -13685,7 +13685,7 @@ __hot static struct node_result mdbx_node_search(MDBX_cursor *mc, return ret; } - int cr = 0, i = 0; + int i; MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; MDBX_val nodekey; if (unlikely(IS_LEAF2(mp))) { @@ -13696,21 +13696,21 @@ __hot static struct node_result mdbx_node_search(MDBX_cursor *mc, nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= (char *)nodekey.iov_base + nodekey.iov_len); - cr = cmp(key, &nodekey); + int cr = cmp(key, &nodekey); mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); - if (unlikely(cr == 0)) { + if (cr > 0) + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than key. */ + low = ++i; + else if (cr < 0) + high = i - 1; + else { ret.exact = true; break; } - low = (cr < 0) ? low : i + 1; - high = (cr < 0) ? i - 1 : high; } while (likely(low <= high)); - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. */ - i += cr > 0; - /* store the key index */ mc->mc_ki[mc->mc_top] = (indx_t)i; ret.node = (i < nkeys) @@ -13727,32 +13727,30 @@ __hot static struct node_result mdbx_node_search(MDBX_cursor *mc, MDBX_node *node; do { i = (low + high) >> 1; - node = page_node(mp, i); nodekey.iov_len = node_ks(node); nodekey.iov_base = node_key(node); mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= (char *)nodekey.iov_base + nodekey.iov_len); - - cr = cmp(key, &nodekey); + int cr = cmp(key, &nodekey); if (IS_LEAF(mp)) mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); else mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, DKEY_DEBUG(&nodekey), node_pgno(node), cr); - if (unlikely(cr == 0)) { + if (cr > 0) + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than key. 
*/ + low = ++i; + else if (cr < 0) + high = i - 1; + else { ret.exact = true; break; } - low = (cr < 0) ? low : i + 1; - high = (cr < 0) ? i - 1 : high; } while (likely(low <= high)); - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. */ - i += cr > 0; - /* store the key index */ mc->mc_ki[mc->mc_top] = (indx_t)i; ret.node = (i < nkeys) @@ -13956,7 +13954,7 @@ mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags) { } } else { const struct node_result nsr = mdbx_node_search(mc, key); - if (nsr.node) + if (likely(nsr.node)) i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; else i = page_numkeys(mp) - 1; diff --git a/src/osal.c b/src/osal.c index 719f297e..91a3f8c7 100644 --- a/src/osal.c +++ b/src/osal.c @@ -2034,10 +2034,10 @@ mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { static uint64_t limit; if (unlikely(monotime > limit)) { - if (limit != 0) + if (likely(limit != 0)) return UINT32_MAX; limit = mdbx_osal_16dot16_to_monotime(UINT32_MAX - 1); - if (monotime > limit) + if (unlikely(monotime > limit)) return UINT32_MAX; } const uint32_t ret = @@ -2048,7 +2048,9 @@ MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { #else (uint32_t)(monotime * 128 / 1953125); #endif - return likely(ret || monotime == 0) ? 
ret : /* fix underflow */ 1; + if (likely(ret > 0)) + return ret; + return monotime > 0 /* fix underflow */; } MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) { @@ -2091,7 +2093,7 @@ static void bootid_shake(bin128_t *p) { p->d = e + p->a; } -static void bootid_collect(bin128_t *p, const void *s, size_t n) { +__cold static void bootid_collect(bin128_t *p, const void *s, size_t n) { p->y += UINT64_C(64526882297375213); bootid_shake(p); for (size_t i = 0; i < n; ++i) { From 18e557c6e8042bbb15fef5e38947c7187ef8a969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 11 Aug 2022 01:03:15 +0300 Subject: [PATCH 086/364] mdbx: rename internal functions, types and macros (to be closer to MithrilDB). --- src/core.c | 6521 ++++++++++++++++++------------------ src/internals.h | 196 +- src/lck-posix.c | 194 +- src/lck-windows.c | 132 +- src/mdbx_chk.c | 22 +- src/mdbx_copy.c | 2 +- src/mdbx_drop.c | 2 +- src/mdbx_dump.c | 4 +- src/mdbx_load.c | 10 +- src/mdbx_stat.c | 6 +- src/osal.c | 207 +- src/osal.h | 237 +- test/base.h | 2 +- test/copy.cc | 2 +- test/valgrind_suppress.txt | 150 +- 15 files changed, 3754 insertions(+), 3933 deletions(-) diff --git a/src/core.c b/src/core.c index 25437165..cb08e340 100644 --- a/src/core.c +++ b/src/core.c @@ -494,8 +494,8 @@ __cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env, } size_t mdbx_default_pagesize(void) { - size_t pagesize = mdbx_syspagesize(); - mdbx_ensure(nullptr, is_powerof2(pagesize)); + size_t pagesize = osal_syspagesize(); + ENSURE(nullptr, is_powerof2(pagesize)); pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE; pagesize = (pagesize <= MAX_PAGESIZE) ? 
pagesize : MAX_PAGESIZE; return pagesize; @@ -602,7 +602,7 @@ flags_db2sub(uint16_t db_flags) { MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t pgno2bytes(const MDBX_env *env, pgno_t pgno) { - mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); + eASSERT(env, (1u << env->me_psize2log) == env->me_psize); return ((size_t)pgno) << env->me_psize2log; } @@ -613,7 +613,7 @@ pgno2page(const MDBX_env *env, pgno_t pgno) { MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { - mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); + eASSERT(env, (env->me_psize >> env->me_psize2log) == 1); return (pgno_t)(bytes >> env->me_psize2log); } @@ -709,20 +709,20 @@ __cold static const char *pagetype_caption(const uint8_t type, __cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3) bad_page(const MDBX_page *mp, const char *fmt, ...) { - if (mdbx_log_enabled(MDBX_LOG_ERROR)) { + if (LOG_ENABLED(MDBX_LOG_ERROR)) { static const MDBX_page *prev; if (prev != mp) { char buf4unknown[16]; prev = mp; - mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0, - "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", - pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), - mp->mp_pgno, mp->mp_txnid); + debug_log(MDBX_LOG_ERROR, "badpage", 0, + "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, + mp->mp_txnid); } va_list args; va_start(args, fmt); - mdbx_debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); + debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); va_end(args); } return MDBX_CORRUPTED; @@ -730,20 +730,20 @@ __cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3) __cold static void MDBX_PRINTF_ARGS(2, 3) poor_page(const MDBX_page *mp, const char *fmt, ...) 
{ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) { + if (LOG_ENABLED(MDBX_LOG_NOTICE)) { static const MDBX_page *prev; if (prev != mp) { char buf4unknown[16]; prev = mp; - mdbx_debug_log(MDBX_LOG_NOTICE, "poorpage", 0, - "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n", - pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), - mp->mp_pgno, mp->mp_txnid); + debug_log(MDBX_LOG_NOTICE, "poorpage", 0, + "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, + mp->mp_txnid); } va_list args; va_start(args, fmt); - mdbx_debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args); + debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args); va_end(args); } } @@ -909,16 +909,16 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ #else /* !MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); atomic_store32(&p->low, (uint32_t)value, mo_Relaxed); - mdbx_jitter4testing(true); + jitter4testing(true); atomic_store32(&p->high, (uint32_t)(value >> 32), order); - mdbx_jitter4testing(true); + jitter4testing(true); #endif /* !MDBX_64BIT_ATOMIC */ return value; } @@ -938,26 +938,26 @@ MDBX_MAYBE_UNUSED static assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint64_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ #else /* !MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); uint64_t value = 
(uint64_t)atomic_load32(&p->high, order) << 32; - mdbx_jitter4testing(true); + jitter4testing(true); value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease); - mdbx_jitter4testing(true); + jitter4testing(true); for (;;) { - mdbx_compiler_barrier(); + osal_compiler_barrier(); uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32; - mdbx_jitter4testing(true); + jitter4testing(true); again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease); - mdbx_jitter4testing(true); + jitter4testing(true); if (likely(value == again)) return value; value = again; @@ -1113,7 +1113,7 @@ static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p, #endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */ } assert(p->weak >= SAFE64_INVALID_THRESHOLD); - mdbx_jitter4testing(true); + jitter4testing(true); } static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, @@ -1140,7 +1140,7 @@ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, rc = true; } #endif /* MDBX_64BIT_CAS */ - mdbx_jitter4testing(true); + jitter4testing(true); return rc; } @@ -1150,20 +1150,20 @@ static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, #if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS atomic_store64(p, v, mo_AcquireRelease); #else /* MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ atomic_store32(&p->low, (uint32_t)v, mo_Relaxed); assert(p->weak >= SAFE64_INVALID_THRESHOLD); - mdbx_jitter4testing(true); + jitter4testing(true); /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease); #endif /* MDBX_64BIT_ATOMIC */ assert(p->weak == v); - mdbx_jitter4testing(true); + jitter4testing(true); } static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { - mdbx_jitter4testing(true); + jitter4testing(true); uint64_t v; do v 
= atomic_load64(p, mo_AcquireRelease); @@ -1218,7 +1218,7 @@ MDBX_MAYBE_UNUSED static typedef struct rthc_entry_t { MDBX_reader *begin; MDBX_reader *end; - mdbx_thread_key_t thr_tls_key; + osal_thread_key_t thr_tls_key; } rthc_entry_t; #if MDBX_DEBUG @@ -1237,11 +1237,11 @@ static CRITICAL_SECTION lcklist_critical_section; static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; -static mdbx_thread_key_t rthc_key; +static osal_thread_key_t rthc_key; static MDBX_atomic_uint32_t rthc_pending; static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { - uint64_t salt = mdbx_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ + uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return salt << 8 | kind; @@ -1356,7 +1356,7 @@ static __inline void rthc_lock(void) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, mdbx_pthread_mutex_lock(&rthc_mutex) == 0); + ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0); #endif } @@ -1364,11 +1364,11 @@ static __inline void rthc_unlock(void) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); + ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); #endif } -static __inline int thread_key_create(mdbx_thread_key_t *key) { +static __inline int thread_key_create(osal_thread_key_t *key) { int rc; #if defined(_WIN32) || defined(_WIN64) *key = TlsAlloc(); @@ -1376,22 +1376,22 @@ static __inline int thread_key_create(mdbx_thread_key_t *key) { #else rc = pthread_key_create(key, nullptr); #endif - mdbx_trace("&key = %p, value %" PRIuPTR ", rc %d", - __Wpedantic_format_voidptr(key), (uintptr_t)*key, rc); + 
TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key), + (uintptr_t)*key, rc); return rc; } -static __inline void thread_key_delete(mdbx_thread_key_t key) { - mdbx_trace("key = %" PRIuPTR, (uintptr_t)key); +static __inline void thread_key_delete(osal_thread_key_t key) { + TRACE("key = %" PRIuPTR, (uintptr_t)key); #if defined(_WIN32) || defined(_WIN64) - mdbx_ensure(nullptr, TlsFree(key)); + ENSURE(nullptr, TlsFree(key)); #else - mdbx_ensure(nullptr, pthread_key_delete(key) == 0); + ENSURE(nullptr, pthread_key_delete(key) == 0); workaround_glibc_bug21031(); #endif } -static __inline void *thread_rthc_get(mdbx_thread_key_t key) { +static __inline void *thread_rthc_get(osal_thread_key_t key) { #if defined(_WIN32) || defined(_WIN64) return TlsGetValue(key); #else @@ -1399,9 +1399,9 @@ static __inline void *thread_rthc_get(mdbx_thread_key_t key) { #endif } -static void thread_rthc_set(mdbx_thread_key_t key, const void *value) { +static void thread_rthc_set(osal_thread_key_t key, const void *value) { #if defined(_WIN32) || defined(_WIN64) - mdbx_ensure(nullptr, TlsSetValue(key, (void *)value)); + ENSURE(nullptr, TlsSetValue(key, (void *)value)); #else const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state); @@ -1409,51 +1409,49 @@ static void thread_rthc_set(mdbx_thread_key_t key, const void *value) { if (value && unlikely(rthc_thread_state != sign_registered && rthc_thread_state != sign_counted)) { rthc_thread_state = sign_registered; - mdbx_trace("thread registered 0x%" PRIxPTR, mdbx_thread_self()); - if (rthc_atexit(mdbx_rthc_thread_dtor, &rthc_thread_state, + TRACE("thread registered 0x%" PRIxPTR, osal_thread_self()); + if (rthc_atexit(thread_dtor, &rthc_thread_state, (void *)&mdbx_version /* dso_anchor */)) { - mdbx_ensure(nullptr, - pthread_setspecific(rthc_key, &rthc_thread_state) == 0); + ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0); rthc_thread_state = sign_counted; const unsigned 
count_before = atomic_add32(&rthc_pending, 1); - mdbx_ensure(nullptr, count_before < INT_MAX); - mdbx_notice("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", - (uintptr_t)rthc_key, count_before); + ENSURE(nullptr, count_before < INT_MAX); + NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", + (uintptr_t)rthc_key, count_before); (void)count_before; } } - mdbx_ensure(nullptr, pthread_setspecific(key, value) == 0); + ENSURE(nullptr, pthread_setspecific(key, value) == 0); #endif } -__cold void mdbx_rthc_global_init(void) { +__cold void global_ctor(void) { rthc_limit = RTHC_INITIAL_LIMIT; rthc_table = rthc_table_static; #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(&rthc_critical_section); InitializeCriticalSection(&lcklist_critical_section); #else - mdbx_ensure(nullptr, - pthread_key_create(&rthc_key, mdbx_rthc_thread_dtor) == 0); - mdbx_trace("pid %d, &mdbx_rthc_key = %p, value 0x%x", mdbx_getpid(), - __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); + ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); + TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), + __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); #endif /* checking time conversion, this also avoids racing on 32-bit architectures * during storing calculated 64-bit ratio(s) into memory. */ uint32_t proba = UINT32_MAX; while (true) { unsigned time_conversion_checkup = - mdbx_osal_monotime_to_16dot16(mdbx_osal_16dot16_to_monotime(proba)); + osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba)); unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; unsigned one_less = (proba > 0) ? 
proba - 1 : proba; - mdbx_ensure(nullptr, time_conversion_checkup >= one_less && - time_conversion_checkup <= one_more); + ENSURE(nullptr, time_conversion_checkup >= one_less && + time_conversion_checkup <= one_more); if (proba == 0) break; proba >>= 1; } - bootid = mdbx_osal_bootid(); + bootid = osal_bootid(); #if 0 /* debug */ for (unsigned i = 0; i < 65536; ++i) { @@ -1469,43 +1467,42 @@ __cold void mdbx_rthc_global_init(void) { } /* dtor called for thread, i.e. for all mdbx's environment objects */ -__cold void mdbx_rthc_thread_dtor(void *rthc) { +__cold void thread_dtor(void *rthc) { rthc_lock(); - mdbx_trace(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", mdbx_getpid(), - mdbx_thread_self(), rthc); + TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", osal_getpid(), + osal_thread_self(), rthc); - const uint32_t self_pid = mdbx_getpid(); + const uint32_t self_pid = osal_getpid(); for (unsigned i = 0; i < rthc_count; ++i) { - const mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + const osal_thread_key_t key = rthc_table[i].thr_tls_key; MDBX_reader *const reader = thread_rthc_get(key); if (reader < rthc_table[i].begin || reader >= rthc_table[i].end) continue; #if !defined(_WIN32) && !defined(_WIN64) if (pthread_setspecific(key, nullptr) != 0) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p: ignore race with tsd-key deletion", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader)); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p: ignore race with tsd-key deletion", + osal_thread_self(), __Wpedantic_format_voidptr(reader)); continue /* ignore race with tsd-key deletion by mdbx_env_close() */; } #endif - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, [%i], %p ... 
%p (%+i), rtch-pid %i, " - "current-pid %i", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader), i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, - self_pid); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, " + "current-pid %i", + osal_thread_self(), __Wpedantic_format_voidptr(reader), i, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, self_pid); if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { - mdbx_trace("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader)); + TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), + __Wpedantic_format_voidptr(reader)); atomic_cas32(&reader->mr_pid, self_pid, 0); } } #if defined(_WIN32) || defined(_WIN64) - mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), rthc); + TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); rthc_unlock(); #else const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); @@ -1513,28 +1510,28 @@ __cold void mdbx_rthc_thread_dtor(void *rthc) { const uint64_t state = rthc_read(rthc); if (state == sign_registered && rthc_compare_and_clean(rthc, sign_registered)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "registered", state); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "counted", state); - 
mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { - mdbx_warning("thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "wrong", state); + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "wrong", state); } if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) { - mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", - mdbx_thread_self(), rthc, mdbx_getpid()); - mdbx_ensure(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(), + rthc, osal_getpid()); + ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); } - mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), rthc); + TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); /* Allow tail call optimization, i.e. gcc should generate the jmp instruction * instead of a call for pthread_mutex_unlock() and therefore CPU could not * return to current DSO's code section, which may be unloaded immediately @@ -1544,44 +1541,44 @@ __cold void mdbx_rthc_thread_dtor(void *rthc) { } MDBX_EXCLUDE_FOR_GPROF -__cold void mdbx_rthc_global_dtor(void) { - mdbx_trace(">> pid %d", mdbx_getpid()); +__cold void global_dtor(void) { + TRACE(">> pid %d", osal_getpid()); rthc_lock(); #if !defined(_WIN32) && !defined(_WIN64) uint64_t *rthc = pthread_getspecific(rthc_key); - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status 0x%08" PRIx64 ", left %d", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), rthc ? 
rthc_read(rthc) : ~UINT64_C(0), - atomic_load32(&rthc_pending, mo_Relaxed)); + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 + ", left %d", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + rthc ? rthc_read(rthc) : ~UINT64_C(0), + atomic_load32(&rthc_pending, mo_Relaxed)); if (rthc) { const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); const uint64_t state = rthc_read(rthc); if (state == sign_registered && rthc_compare_and_clean(rthc, sign_registered)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "registered", state); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "counted", state); - mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { - mdbx_warning("thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "wrong", state); + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), + osal_getpid(), "wrong", state); } } struct timespec abstime; - mdbx_ensure(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); + 
ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); abstime.tv_nsec += 1000000000l / 10; if (abstime.tv_nsec >= 1000000000l) { abstime.tv_nsec -= 1000000000l; @@ -1593,8 +1590,7 @@ __cold void mdbx_rthc_global_dtor(void) { for (unsigned left; (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { - mdbx_notice("tls-cleanup: pid %d, pending %u, wait for...", mdbx_getpid(), - left); + NOTICE("tls-cleanup: pid %d, pending %u, wait for...", osal_getpid(), left); const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); if (rc && rc != EINTR) break; @@ -1602,29 +1598,28 @@ __cold void mdbx_rthc_global_dtor(void) { thread_key_delete(rthc_key); #endif - const uint32_t self_pid = mdbx_getpid(); + const uint32_t self_pid = osal_getpid(); for (unsigned i = 0; i < rthc_count; ++i) { - const mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + const osal_thread_key_t key = rthc_table[i].thr_tls_key; thread_key_delete(key); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { - mdbx_trace( - "== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " - "rthc-pid %i, current-pid %i", - i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), - rthc->mr_pid.weak, self_pid); + TRACE("== [%i] = key %" PRIuPTR ", %p ... 
%p, rthc %p (%+i), " + "rthc-pid %i, current-pid %i", + i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), + rthc->mr_pid.weak, self_pid); if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } } rthc_limit = rthc_count = 0; if (rthc_table != rthc_table_static) - mdbx_free(rthc_table); + osal_free(rthc_table); rthc_table = nullptr; rthc_unlock(); @@ -1637,22 +1632,22 @@ __cold void mdbx_rthc_global_dtor(void) { workaround_glibc_bug21031(); #endif - mdbx_trace("<< pid %d\n", mdbx_getpid()); + TRACE("<< pid %d\n", osal_getpid()); } -__cold int mdbx_rthc_alloc(mdbx_thread_key_t *pkey, MDBX_reader *begin, - MDBX_reader *end) { +__cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, + MDBX_reader *end) { assert(pkey != NULL); #ifndef NDEBUG - *pkey = (mdbx_thread_key_t)0xBADBADBAD; + *pkey = (osal_thread_key_t)0xBADBADBAD; #endif /* NDEBUG */ rthc_lock(); - mdbx_trace(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); + TRACE(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); int rc; if (rthc_count == rthc_limit) { rthc_entry_t *new_table = - mdbx_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, + osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, sizeof(rthc_entry_t) * rthc_limit * 2); if (new_table == nullptr) { rc = MDBX_ENOMEM; @@ -1669,15 +1664,14 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *pkey, MDBX_reader *begin, goto bailout; *pkey = rthc_table[rthc_count].thr_tls_key; - mdbx_trace("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count, - (uintptr_t)*pkey, __Wpedantic_format_voidptr(begin), - __Wpedantic_format_voidptr(end)); + TRACE("== [%i] = key %" PRIuPTR ", %p ... 
%p", rthc_count, (uintptr_t)*pkey, + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end)); rthc_table[rthc_count].begin = begin; rthc_table[rthc_count].end = end; ++rthc_count; - mdbx_trace("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", - (uintptr_t)*pkey, rthc_count, rthc_limit); + TRACE("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", (uintptr_t)*pkey, + rthc_count, rthc_limit); rthc_unlock(); return MDBX_SUCCESS; @@ -1686,30 +1680,30 @@ bailout: return rc; } -__cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { +__cold void rthc_remove(const osal_thread_key_t key) { thread_key_delete(key); rthc_lock(); - mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, - rthc_count, rthc_limit); + TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count, + rthc_limit); for (unsigned i = 0; i < rthc_count; ++i) { if (key == rthc_table[i].thr_tls_key) { - const uint32_t self_pid = mdbx_getpid(); - mdbx_trace("== [%i], %p ...%p, current-pid %d", i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); + const uint32_t self_pid = osal_getpid(); + TRACE("== [%i], %p ...%p, current-pid %d", i, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } if (--rthc_count > 0) rthc_table[i] = rthc_table[rthc_count]; else if (rthc_table != rthc_table_static) { - mdbx_free(rthc_table); + osal_free(rthc_table); rthc_table = rthc_table_static; rthc_limit = RTHC_INITIAL_LIMIT; } @@ -1717,8 +1711,8 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { } } - mdbx_trace("<< key %zu, 
rthc_count %u, rthc_limit %u", (size_t)key, - rthc_count, rthc_limit); + TRACE("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, + rthc_limit); rthc_unlock(); } @@ -1731,7 +1725,7 @@ static __inline void lcklist_lock(void) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(&lcklist_critical_section); #else - mdbx_ensure(nullptr, mdbx_pthread_mutex_lock(&lcklist_mutex) == 0); + ENSURE(nullptr, osal_pthread_mutex_lock(&lcklist_mutex) == 0); #endif } @@ -1739,7 +1733,7 @@ static __inline void lcklist_unlock(void) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(&lcklist_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); + ENSURE(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); #endif } @@ -1752,7 +1746,7 @@ MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { return v ^ v >> 28; } -static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { +static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) { int rc; uint64_t bait; MDBX_lockinfo *const pending_lck = pending->lck; @@ -1762,32 +1756,31 @@ static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { rc = MDBX_SUCCESS; } else { bait = 0 /* hush MSVC warning */; - rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); + rc = osal_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); if (rc == MDBX_SUCCESS) - rc = mdbx_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), + rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), offsetof(MDBX_lockinfo, mti_bait_uniqueness)); } if (likely(rc == MDBX_SUCCESS) && bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)) rc = MDBX_RESULT_TRUE; - mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", - pending_lck ? "mem" : "file", bait, - (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); + TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", + pending_lck ? 
"mem" : "file", bait, + (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); return rc; } -static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, +static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, uint64_t *abra) { if (*abra == 0) { - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); uintptr_t uit = 0; memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); - *abra = - rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); + *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit); } const uint64_t cadabra = - rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid()) + rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid()) << 24 | *abra >> 40; MDBX_lockinfo *const scan_lck = scan->lck; @@ -1796,7 +1789,7 @@ static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, return uniq_peek(pending, scan); } -__cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { +__cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { *found = nullptr; uint64_t salt = 0; for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; @@ -1807,33 +1800,33 @@ __cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { uint64_t length; - if (likely(mdbx_filesize(pending->fd, &length) == MDBX_SUCCESS && + if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && length == 0)) { /* LY: skip checking since LCK-file is empty, i.e. just created. 
*/ - mdbx_debug("uniq-probe: %s", "unique (new/empty lck)"); + DEBUG("uniq-probe: %s", "unique (new/empty lck)"); return MDBX_RESULT_TRUE; } } if (err == MDBX_RESULT_TRUE) err = uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_RESULT_TRUE) { - (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), + (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_NONE); err = uniq_poke(pending, &scan->me_lck_mmap, &salt); } if (err == MDBX_RESULT_TRUE) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt); *found = scan; - mdbx_debug("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); + DEBUG("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); return MDBX_RESULT_FALSE; } if (unlikely(err != MDBX_SUCCESS)) { - mdbx_debug("uniq-probe: failed rc %d", err); + DEBUG("uniq-probe: failed rc %d", err); return err; } } - mdbx_debug("uniq-probe: %s", "unique"); + DEBUG("uniq-probe: %s", "unique"); return MDBX_RESULT_TRUE; } @@ -1841,8 +1834,8 @@ static int lcklist_detach_locked(MDBX_env *env) { MDBX_env *inprocess_neighbor = nullptr; int rc = MDBX_SUCCESS; if (env->me_lcklist_next != nullptr) { - mdbx_ensure(env, env->me_lcklist_next != nullptr); - mdbx_ensure(env, inprocess_lcklist_head != RTHC_ENVLIST_END); + ENSURE(env, env->me_lcklist_next != nullptr); + ENSURE(env, inprocess_lcklist_head != RTHC_ENVLIST_END); for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; ptr = &(*ptr)->me_lcklist_next) { if (*ptr == env) { @@ -1851,16 +1844,16 @@ static int lcklist_detach_locked(MDBX_env *env) { break; } } - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); } - rc = likely(mdbx_getpid() == env->me_pid) + rc = likely(osal_getpid() == env->me_pid) ? 
uniq_check(&env->me_lck_mmap, &inprocess_neighbor) : MDBX_PANIC; if (!inprocess_neighbor && env->me_live_reader) - (void)mdbx_rpid_clear(env); + (void)osal_rpid_clear(env); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_lck_destroy(env, inprocess_neighbor); + rc = osal_lck_destroy(env, inprocess_neighbor); return rc; } @@ -2194,7 +2187,7 @@ static int lcklist_detach_locked(MDBX_env *env) { } \ } \ \ - if (mdbx_audit_enabled()) { \ + if (AUDIT_ENABLED()) { \ for (TYPE *scan = begin + 1; scan < end; ++scan) \ assert(CMP(scan[-1], scan[0])); \ } \ @@ -2212,7 +2205,7 @@ static int lcklist_detach_locked(MDBX_env *env) { tmp = begin + length + END_GAP; \ /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \ } else { \ - tmp = mdbx_malloc(sizeof(TYPE) * length); \ + tmp = osal_malloc(sizeof(TYPE) * length); \ if (unlikely(!tmp)) \ return false; \ } \ @@ -2265,7 +2258,7 @@ static int lcklist_detach_locked(MDBX_env *env) { } while (key_diff_mask >> 16); \ \ if (!(BUFFER_PREALLOCATED)) \ - mdbx_free(tmp); \ + osal_free(tmp); \ return true; \ } @@ -2330,7 +2323,7 @@ static int lcklist_detach_locked(MDBX_env *env) { it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \ it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \ \ - if (mdbx_audit_enabled()) { \ + if (AUDIT_ENABLED()) { \ for (const TYPE_LIST *scan = begin; scan < it; ++scan) \ assert(CMP(*scan, item)); \ for (const TYPE_LIST *scan = it; scan < end; ++scan) \ @@ -2343,7 +2336,7 @@ static int lcklist_detach_locked(MDBX_env *env) { /*----------------------------------------------------------------------------*/ -static __always_inline size_t pnl2bytes(size_t size) { +static __always_inline size_t pnl_size2bytes(size_t size) { assert(size > 0 && size <= MDBX_PGL_LIMIT); #if MDBX_PNL_PREALLOC_FOR_RADIXSORT size += size; @@ -2360,7 +2353,7 @@ static __always_inline size_t pnl2bytes(size_t size) { return bytes; } -static __always_inline pgno_t bytes2pnl(const size_t bytes) { +static __always_inline 
pgno_t pnl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(pgno_t); assert(size > 2 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); size -= 2; @@ -2370,14 +2363,14 @@ static __always_inline pgno_t bytes2pnl(const size_t bytes) { return (pgno_t)size; } -static MDBX_PNL mdbx_pnl_alloc(size_t size) { - size_t bytes = pnl2bytes(size); - MDBX_PNL pl = mdbx_malloc(bytes); +static MDBX_PNL pnl_alloc(size_t size) { + size_t bytes = pnl_size2bytes(size); + MDBX_PNL pl = osal_malloc(bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - pl[0] = bytes2pnl(bytes); + pl[0] = pnl_bytes2size(bytes); assert(pl[0] >= size); pl[1] = 0; pl += 1; @@ -2385,34 +2378,35 @@ static MDBX_PNL mdbx_pnl_alloc(size_t size) { return pl; } -static void mdbx_pnl_free(MDBX_PNL pl) { +static void pnl_free(MDBX_PNL pl) { if (likely(pl)) - mdbx_free(pl - 1); + osal_free(pl - 1); } /* Shrink the PNL to the default size if it has grown larger */ -static void mdbx_pnl_shrink(MDBX_PNL *ppl) { - assert(bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && - bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2); +static void pnl_shrink(MDBX_PNL *ppl) { + assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && + pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < + MDBX_PNL_INITIAL * 3 / 2); assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); MDBX_PNL_SIZE(*ppl) = 0; if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { - size_t bytes = pnl2bytes(MDBX_PNL_INITIAL); - MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); + size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL); + MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); 
#endif /* malloc_usable_size */ - *pl = bytes2pnl(bytes); + *pl = pnl_bytes2size(bytes); *ppl = pl + 1; } } } /* Grow the PNL to the size growed to at least given size */ -static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { +static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); @@ -2420,20 +2414,20 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { return MDBX_SUCCESS; if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) { - mdbx_error("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); + ERROR("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); return MDBX_TXN_FULL; } const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT) ? wanna + wanna - allocated : MDBX_PGL_LIMIT; - size_t bytes = pnl2bytes(size); - MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); + size_t bytes = pnl_size2bytes(size); + MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - *pl = bytes2pnl(bytes); + *pl = pnl_bytes2size(bytes); assert(*pl >= wanna); *ppl = pl + 1; return MDBX_SUCCESS; @@ -2442,20 +2436,19 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { } /* Make room for num additional elements in an PNL */ -static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, - size_t num) { +static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, + size_t num) { assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); assert(num <= MDBX_PGL_LIMIT); const size_t wanna = MDBX_PNL_SIZE(*ppl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) - ? MDBX_SUCCESS - : mdbx_pnl_reserve(ppl, wanna); + return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? 
MDBX_SUCCESS + : pnl_reserve(ppl, wanna); } -static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { +static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i) assert(pgno != pl[i]); } @@ -2464,10 +2457,12 @@ static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { } /* Append an pgno range onto an unsorted PNL */ -__always_inline static int __must_check_result -mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { +__always_inline static int __must_check_result pnl_append_range(bool spilled, + MDBX_PNL *ppl, + pgno_t pgno, + unsigned n) { assert(n > 0); - int rc = mdbx_pnl_need(ppl, n); + int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -2492,11 +2487,10 @@ mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { } /* Append an pgno range into the sorted PNL */ -__hot static int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, - pgno_t pgno, - unsigned n) { +__hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl, + pgno_t pgno, unsigned n) { assert(n > 0); - int rc = mdbx_pnl_need(ppl, n); + int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -2522,7 +2516,7 @@ __hot static bool pnl_check(const pgno_t *pl, const size_t limit) { if (unlikely(MDBX_PNL_MOST(pl) >= limit)) return false; - if ((!MDBX_DISABLE_VALIDATION || mdbx_audit_enabled()) && + if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) && likely(MDBX_PNL_SIZE(pl) > 1)) { const pgno_t *scan = MDBX_PNL_BEGIN(pl); const pgno_t *const end = MDBX_PNL_END(pl); @@ -2590,9 +2584,9 @@ __hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); } -static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { - mdbx_tassert(txn, idx > 0 && 
idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && - txn->tw.spill_least_removed > 0); +static void spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { + tASSERT(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && + txn->tw.spill_least_removed > 0); txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed; txn->tw.spill_pages[idx] |= 1; @@ -2619,8 +2613,8 @@ static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { } } -static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) { - mdbx_tassert(txn, txn->tw.spill_least_removed > 0); +static MDBX_PNL spill_purge(MDBX_txn *txn) { + tASSERT(txn, txn->tw.spill_least_removed > 0); const MDBX_PNL sl = txn->tw.spill_pages; if (txn->tw.spill_least_removed != INT_MAX) { unsigned len = MDBX_PNL_SIZE(sl), r, w; @@ -2629,12 +2623,12 @@ static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) { w += 1 - (sl[r] & 1); } for (size_t i = 1; i < w; ++i) - mdbx_tassert(txn, (sl[i] & 1) == 0); + tASSERT(txn, (sl[i] & 1) == 0); MDBX_PNL_SIZE(sl) = w - 1; txn->tw.spill_least_removed = INT_MAX; } else { for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i) - mdbx_tassert(txn, (sl[i] & 1) == 0); + tASSERT(txn, (sl[i] & 1) == 0); } return sl; } @@ -2649,14 +2643,14 @@ RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) -__hot __noinline static void mdbx_pnl_sort_nochk(MDBX_PNL pnl) { +__hot __noinline static void pnl_sort_nochk(MDBX_PNL pnl) { if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl)))) pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); } -static __inline void mdbx_pnl_sort(MDBX_PNL pnl, size_t limit4check) { - mdbx_pnl_sort_nochk(pnl); +static __inline void pnl_sort(MDBX_PNL pnl, size_t limit4check) { + pnl_sort_nochk(pnl); assert(pnl_check(pnl, limit4check)); (void)limit4check; } @@ -2665,8 +2659,8 @@ static __inline void mdbx_pnl_sort(MDBX_PNL pnl, 
size_t limit4check) { * Returns The index of the first item greater than or equal to pgno. */ SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) -__hot __noinline static unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, - pgno_t pgno) { +__hot __noinline static unsigned pnl_search_nochk(const MDBX_PNL pnl, + pgno_t pgno) { const pgno_t *begin = MDBX_PNL_BEGIN(pnl); const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno); const pgno_t *end = begin + MDBX_PNL_SIZE(pnl); @@ -2678,53 +2672,53 @@ __hot __noinline static unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, return (unsigned)(it - begin + 1); } -static __inline unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno, - size_t limit) { +static __inline unsigned pnl_search(const MDBX_PNL pnl, pgno_t pgno, + size_t limit) { assert(pnl_check_allocated(pnl, limit)); assert(pgno < limit); (void)limit; - return mdbx_pnl_search_nochk(pnl, pgno); + return pnl_search_nochk(pnl, pgno); } -static __inline unsigned mdbx_search_spilled(const MDBX_txn *txn, pgno_t pgno) { +static __inline unsigned search_spilled(const MDBX_txn *txn, pgno_t pgno) { const MDBX_PNL pnl = txn->tw.spill_pages; if (likely(!pnl)) return 0; pgno <<= 1; - unsigned n = mdbx_pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); + unsigned n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0; } -static __inline bool mdbx_intersect_spilled(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { +static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, + unsigned npages) { const MDBX_PNL pnl = txn->tw.spill_pages; if (likely(!pnl)) return false; const unsigned len = MDBX_PNL_SIZE(pnl); - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - mdbx_debug_extra("PNL len %u [", len); + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("PNL len %u [", len); for (unsigned i = 1; i <= len; ++i) - mdbx_debug_extra_print(" %li", (pnl[i] & 1) ? 
-(long)(pnl[i] >> 1) - : (long)(pnl[i] >> 1)); - mdbx_debug_extra_print("%s\n", "]"); + DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1) + : (long)(pnl[i] >> 1)); + DEBUG_EXTRA_PRINT("%s\n", "]"); } const pgno_t spilled_range_begin = pgno << 1; const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1; #if MDBX_PNL_ASCENDING const unsigned n = - mdbx_pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); + pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= spilled_range_last; #else const unsigned n = - mdbx_pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); + pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_last >= pnl[n])); const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= spilled_range_begin; #endif - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { bool check = false; for (unsigned i = 0; i < npages; ++i) - check |= mdbx_search_spilled(txn, pgno + i) != 0; + check |= search_spilled(txn, pgno + i) != 0; assert(check == rc); } return rc; @@ -2732,7 +2726,7 @@ static __inline bool mdbx_intersect_spilled(const MDBX_txn *txn, pgno_t pgno, /*----------------------------------------------------------------------------*/ -static __always_inline size_t txl2bytes(const size_t size) { +static __always_inline size_t txl_size2bytes(const size_t size) { assert(size > 0 && size <= MDBX_TXL_MAX * 2); size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), @@ -2741,20 +2735,20 @@ static __always_inline size_t txl2bytes(const size_t size) { return bytes; } -static __always_inline size_t bytes2txl(const size_t bytes) { +static __always_inline size_t txl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(txnid_t); assert(size > 2 && size <= MDBX_TXL_MAX * 2); 
return size - 2; } -static MDBX_TXL mdbx_txl_alloc(void) { - size_t bytes = txl2bytes(MDBX_TXL_INITIAL); - MDBX_TXL tl = mdbx_malloc(bytes); +static MDBX_TXL txl_alloc(void) { + size_t bytes = txl_size2bytes(MDBX_TXL_INITIAL); + MDBX_TXL tl = osal_malloc(bytes); if (likely(tl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(tl); #endif /* malloc_usable_size */ - tl[0] = bytes2txl(bytes); + tl[0] = txl_bytes2size(bytes); assert(tl[0] >= MDBX_TXL_INITIAL); tl[1] = 0; tl += 1; @@ -2762,12 +2756,12 @@ static MDBX_TXL mdbx_txl_alloc(void) { return tl; } -static void mdbx_txl_free(MDBX_TXL tl) { +static void txl_free(MDBX_TXL tl) { if (likely(tl)) - mdbx_free(tl - 1); + osal_free(tl - 1); } -static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { +static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); @@ -2775,20 +2769,20 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_SUCCESS; if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) { - mdbx_error("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); + ERROR("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); return MDBX_TXN_FULL; } const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) ? 
wanna + wanna - allocated : MDBX_TXL_MAX; - size_t bytes = txl2bytes(size); - MDBX_TXL tl = mdbx_realloc(*ptl - 1, bytes); + size_t bytes = txl_size2bytes(size); + MDBX_TXL tl = osal_realloc(*ptl - 1, bytes); if (likely(tl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(tl); #endif /* malloc_usable_size */ - *tl = bytes2txl(bytes); + *tl = txl_bytes2size(bytes); assert(*tl >= wanna); *ptl = tl + 1; return MDBX_SUCCESS; @@ -2796,18 +2790,17 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_ENOMEM; } -static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, - size_t num) { +static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, + size_t num) { assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); assert(num <= MDBX_PGL_LIMIT); const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) - ? MDBX_SUCCESS - : mdbx_txl_reserve(ptl, wanna); + return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? 
MDBX_SUCCESS + : txl_reserve(ptl, wanna); } -static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { +static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) { assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); MDBX_PNL_SIZE(tl) += 1; MDBX_PNL_LAST(tl) = id; @@ -2815,17 +2808,17 @@ static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { #define TXNID_SORT_CMP(first, last) ((first) > (last)) SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP) -static void mdbx_txl_sort(MDBX_TXL tl) { +static void txl_sort(MDBX_TXL tl) { txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); } -static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { +static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { - int rc = mdbx_txl_need(ptl, MDBX_TXL_GRANULATE); + int rc = txl_need(ptl, MDBX_TXL_GRANULATE); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mdbx_txl_xappend(*ptl, id); + txl_xappend(*ptl, id); return MDBX_SUCCESS; } @@ -2837,7 +2830,7 @@ static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { #define MDBX_DPL_RESERVE_GAP \ (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING) -static __always_inline size_t dpl2bytes(ptrdiff_t size) { +static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); #if MDBX_DPL_PREALLOC_FOR_RADIXSORT size += size; @@ -2856,7 +2849,7 @@ static __always_inline size_t dpl2bytes(ptrdiff_t size) { return bytes; } -static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) { +static __always_inline unsigned dpl_bytes2size(const ptrdiff_t bytes) { size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP && size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); @@ -2889,29 +2882,30 @@ static __always_inline void dpl_clear(MDBX_dpl *dl) { assert(dl->items[0].pgno == 0 && 
dl->items[dl->length + 1].pgno == P_INVALID); } -static void mdbx_dpl_free(MDBX_txn *txn) { +static void dpl_free(MDBX_txn *txn) { if (likely(txn->tw.dirtylist)) { - mdbx_free(txn->tw.dirtylist); + osal_free(txn->tw.dirtylist); txn->tw.dirtylist = NULL; } } -static MDBX_dpl *mdbx_dpl_reserve(MDBX_txn *txn, size_t size) { - size_t bytes = dpl2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); - MDBX_dpl *const dl = mdbx_realloc(txn->tw.dirtylist, bytes); +static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) { + size_t bytes = + dpl_size2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); + MDBX_dpl *const dl = osal_realloc(txn->tw.dirtylist, bytes); if (likely(dl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(dl); #endif /* malloc_usable_size */ - dl->detent = bytes2dpl(bytes); - mdbx_tassert(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); + dl->detent = dpl_bytes2size(bytes); + tASSERT(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); txn->tw.dirtylist = dl; } return dl; } -static int mdbx_dpl_alloc(MDBX_txn *txn) { - mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); +static int dpl_alloc(MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) ? 
txn->mt_env->me_options.dp_initial : txn->mt_geo.upper; @@ -2923,7 +2917,7 @@ static int mdbx_dpl_alloc(MDBX_txn *txn) { (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold))) return MDBX_SUCCESS; } - if (unlikely(!mdbx_dpl_reserve(txn, wanna))) + if (unlikely(!dpl_reserve(txn, wanna))) return MDBX_ENOMEM; dpl_clear(txn->tw.dirtylist); return MDBX_SUCCESS; @@ -2973,7 +2967,7 @@ __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { assert(r == tmp - 1); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_assert_enabled()) + if (ASSERT_ENABLED()) for (unsigned i = 0; i <= dl->length; ++i) assert(dl->items[i].pgno < dl->items[i + 1].pgno); } else { @@ -3005,7 +2999,7 @@ SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) __hot __noinline static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) { assert(ptr[0].pgno < ptr[1].pgno); assert(ptr[0].pgno >= NUM_METAS); @@ -3056,8 +3050,8 @@ dpl_endpgno(const MDBX_dpl *dl, unsigned i) { return dpl_npages(dl, i) + dl->items[i].pgno; } -static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { +static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, + unsigned npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->sorted == dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); @@ -3068,7 +3062,7 @@ static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, const bool rc = /* intersection with founded */ pgno + npages > dl->items[n].pgno || /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { bool check = false; for (unsigned i = 1; i <= dl->length; 
++i) { const MDBX_page *const dp = dl->items[i].ptr; @@ -3081,7 +3075,7 @@ static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, return rc; } -static __always_inline unsigned mdbx_dpl_exist(MDBX_txn *txn, pgno_t pgno) { +static __always_inline unsigned dpl_exist(MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; unsigned i = dpl_search(txn, pgno); assert((int)i > 0); @@ -3105,7 +3099,7 @@ MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, return nullptr; } -static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) { +static void dpl_remove(const MDBX_txn *txn, unsigned i) { MDBX_dpl *dl = txn->tw.dirtylist; assert((int)i > 0 && i <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); @@ -3116,16 +3110,18 @@ static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) { assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } -static __always_inline int __must_check_result -mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { +static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, + pgno_t pgno, + MDBX_page *page, + unsigned npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { for (unsigned i = dl->length; i > 0; --i) { assert(dl->items[i].pgno != pgno); if (unlikely(dl->items[i].pgno == pgno)) { - mdbx_error("Page %u already exist in the DPL at %u", pgno, i); + ERROR("Page %u already exist in the DPL at %u", pgno, i); return MDBX_PROBLEM; } } @@ -3139,16 +3135,16 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { if (unlikely(dl->length == dl->detent)) { if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { - mdbx_error("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); + ERROR("DPL is full 
(MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); return MDBX_TXN_FULL; } const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) ? dl->detent + dl->detent : dl->detent + dl->detent / 2; - dl = mdbx_dpl_reserve(txn, size); + dl = dpl_reserve(txn, size); if (unlikely(!dl)) return MDBX_ENOMEM; - mdbx_tassert(txn, dl->length < dl->detent); + tASSERT(txn, dl->length < dl->detent); } /* copy the stub beyond the end */ @@ -3164,7 +3160,7 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { return MDBX_SUCCESS; } -static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { +static __inline uint32_t dpl_age(const MDBX_txn *txn, unsigned i) { const MDBX_dpl *dl = txn->tw.dirtylist; assert((int)i > 0 && i <= dl->length); /* overflow could be here */ @@ -3173,15 +3169,15 @@ static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { /*----------------------------------------------------------------------------*/ -uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; -uint8_t mdbx_loglevel = MDBX_LOG_FATAL; -MDBX_debug_func *mdbx_debug_logger; +uint8_t runtime_flags = MDBX_RUNTIME_FLAGS_INIT; +uint8_t loglevel = MDBX_LOG_FATAL; +MDBX_debug_func *debug_logger; -static __must_check_result __inline int mdbx_page_retire(MDBX_cursor *mc, - MDBX_page *mp); +static __must_check_result __inline int page_retire(MDBX_cursor *mc, + MDBX_page *mp); -static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages); +static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, + unsigned npages); typedef struct page_result { MDBX_page *page; int err; @@ -3191,9 +3187,9 @@ static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages); -static int mdbx_page_touch(MDBX_cursor *mc); -static int mdbx_cursor_touch(MDBX_cursor *mc); -static int 
mdbx_touch_dbi(MDBX_cursor *mc); +static int page_touch(MDBX_cursor *mc); +static int cursor_touch(MDBX_cursor *mc); +static int touch_dbi(MDBX_cursor *mc); #define MDBX_END_NAMES \ { \ @@ -3201,7 +3197,7 @@ static int mdbx_touch_dbi(MDBX_cursor *mc); "fail-beginchild" \ } enum { - /* mdbx_txn_end operation number, for logging */ + /* txn_end operation number, for logging */ MDBX_END_COMMITTED, MDBX_END_PURE_COMMIT, MDBX_END_ABORT, @@ -3210,12 +3206,12 @@ enum { MDBX_END_FAIL_BEGIN, MDBX_END_FAIL_BEGINCHILD }; -#define MDBX_END_OPMASK 0x0F /* mask for mdbx_txn_end() operation number */ +#define MDBX_END_OPMASK 0x0F /* mask for txn_end() operation number */ #define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ #define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ #define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ #define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ -static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode); +static int txn_end(MDBX_txn *txn, const unsigned mode); static __always_inline pgr_t page_get_inline(const uint16_t ILL, MDBX_cursor *const mc, @@ -3247,18 +3243,16 @@ static __always_inline int __must_check_result page_get(MDBX_cursor *mc, return ret.err; } -static int __must_check_result mdbx_page_search_root(MDBX_cursor *mc, - const MDBX_val *key, - int flags); +static int __must_check_result page_search_root(MDBX_cursor *mc, + const MDBX_val *key, int flags); #define MDBX_PS_MODIFY 1 #define MDBX_PS_ROOTONLY 2 #define MDBX_PS_FIRST 4 #define MDBX_PS_LAST 8 -static int __must_check_result mdbx_page_search(MDBX_cursor *mc, - const MDBX_val *key, int flags); -static int __must_check_result mdbx_page_merge(MDBX_cursor *csrc, - MDBX_cursor *cdst); +static int __must_check_result page_search(MDBX_cursor *mc, const MDBX_val *key, + int flags); +static int __must_check_result page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); #define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ 
static int __must_check_result page_split(MDBX_cursor *mc, @@ -3268,27 +3262,25 @@ static int __must_check_result page_split(MDBX_cursor *mc, static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, bool report); -static int __must_check_result mdbx_validate_meta_copy(MDBX_env *env, - const MDBX_meta *meta, - MDBX_meta *dest); -static int __must_check_result mdbx_override_meta(MDBX_env *env, - unsigned target, - txnid_t txnid, - const MDBX_meta *shape); -static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta, - const int lck_exclusive, - const mdbx_mode_t mode_bits); -static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending); -static int mdbx_env_close0(MDBX_env *env); +static int __must_check_result validate_meta_copy(MDBX_env *env, + const MDBX_meta *meta, + MDBX_meta *dest); +static int __must_check_result override_meta(MDBX_env *env, unsigned target, + txnid_t txnid, + const MDBX_meta *shape); +static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, + const int lck_exclusive, + const mdbx_mode_t mode_bits); +static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending); +static int env_close(MDBX_env *env); struct node_result { MDBX_node *node; bool exact; }; -static struct node_result mdbx_node_search(MDBX_cursor *mc, - const MDBX_val *key); +static struct node_result node_search(MDBX_cursor *mc, const MDBX_val *key); static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, const MDBX_val *key, @@ -3300,71 +3292,63 @@ static int __must_check_result node_add_leaf2(MDBX_cursor *mc, unsigned indx, const MDBX_val *key); static void node_del(MDBX_cursor *mc, size_t ksize); -static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); -static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, - MDBX_cursor *cdst, bool fromleft); -static int __must_check_result 
mdbx_node_read(MDBX_cursor *mc, - const MDBX_node *leaf, - MDBX_val *data, - const MDBX_page *mp); -static int __must_check_result mdbx_rebalance(MDBX_cursor *mc); -static int __must_check_result mdbx_update_key(MDBX_cursor *mc, - const MDBX_val *key); +static void node_shrink(MDBX_page *mp, unsigned indx); +static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, + bool fromleft); +static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, + MDBX_val *data, const MDBX_page *mp); +static int __must_check_result rebalance(MDBX_cursor *mc); +static int __must_check_result update_key(MDBX_cursor *mc, const MDBX_val *key); -static void mdbx_cursor_pop(MDBX_cursor *mc); -static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); +static void cursor_pop(MDBX_cursor *mc); +static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); -static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, - unsigned retired_stored, - bool dont_filter_gc); +static int __must_check_result audit_ex(MDBX_txn *txn, unsigned retired_stored, + bool dont_filter_gc); -static int __must_check_result mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp); -static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc); -static int __must_check_result mdbx_cursor_check_updating(MDBX_cursor *mc); -static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); -static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *key, - const MDBX_val *data, unsigned flags); +static int __must_check_result page_check(MDBX_cursor *const mc, + const MDBX_page *const mp); +static int __must_check_result cursor_check(MDBX_cursor *mc); +static int __must_check_result cursor_check_updating(MDBX_cursor *mc); +static int __must_check_result cursor_del(MDBX_cursor *mc); +static int __must_check_result delete (MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *key, + const MDBX_val *data, 
unsigned flags); #define SIBLING_LEFT 0 #define SIBLING_RIGHT 2 -static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, int dir); -static int __must_check_result mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); -static int __must_check_result mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); +static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir); +static int __must_check_result cursor_next(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_prev(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); struct cursor_set_result { int err; bool exact; }; -static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); -static int __must_check_result mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); -static int __must_check_result mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); +static struct cursor_set_result cursor_set(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_first(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); +static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); -static int __must_check_result mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, - MDBX_dbi dbi); -static int __must_check_result mdbx_xcursor_init0(MDBX_cursor *mc); -static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc, - MDBX_node *node, - const MDBX_page *mp); -static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc, - MDBX_xcursor *src_mx, - bool new_dupdata); +static int __must_check_result cursor_init(MDBX_cursor *mc, MDBX_txn *txn, + MDBX_dbi dbi); +static int __must_check_result cursor_xinit0(MDBX_cursor *mc); +static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, + 
const MDBX_page *mp); +static int __must_check_result cursor_xinit2(MDBX_cursor *mc, + MDBX_xcursor *src_mx, + bool new_dupdata); static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); -static int __must_check_result mdbx_drop_tree(MDBX_cursor *mc, - const bool may_have_subDBs); -static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); -static int __must_check_result mdbx_setup_dbx(MDBX_dbx *const dbx, - const MDBX_db *const db, - const unsigned pagesize); +static int __must_check_result drop_tree(MDBX_cursor *mc, + const bool may_have_subDBs); +static int __must_check_result fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); +static int __must_check_result setup_dbx(MDBX_dbx *const dbx, + const MDBX_db *const db, + const unsigned pagesize); static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2, cmp_int_unaligned, cmp_lenfast; @@ -3522,30 +3506,30 @@ const char *mdbx_strerror_ANSI2OEM(int errnum) { } #endif /* Bit of madness for Windows */ -__cold void mdbx_debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args) { - if (mdbx_debug_logger) - mdbx_debug_logger(level, function, line, fmt, args); +__cold void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args) { + if (debug_logger) + debug_logger(level, function, line, fmt, args); else { #if defined(_WIN32) || defined(_WIN64) if (IsDebuggerPresent()) { int prefix_len = 0; char *prefix = nullptr; if (function && line > 0) - prefix_len = mdbx_asprintf(&prefix, "%s:%d ", function, line); + prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line); else if (function) - prefix_len = mdbx_asprintf(&prefix, "%s: ", function); + prefix_len = osal_asprintf(&prefix, "%s: ", function); else if (line > 0) - prefix_len = mdbx_asprintf(&prefix, "%d: ", line); + prefix_len = osal_asprintf(&prefix, "%d: ", line); if (prefix_len > 0 && prefix) { OutputDebugStringA(prefix); - mdbx_free(prefix); + osal_free(prefix); } 
char *msg = nullptr; - int msg_len = mdbx_vasprintf(&msg, fmt, args); + int msg_len = osal_vasprintf(&msg, fmt, args); if (msg_len > 0 && msg) { OutputDebugStringA(msg); - mdbx_free(msg); + osal_free(msg); } } #else @@ -3561,11 +3545,11 @@ __cold void mdbx_debug_log_va(int level, const char *function, int line, } } -__cold void mdbx_debug_log(int level, const char *function, int line, - const char *fmt, ...) { +__cold void debug_log(int level, const char *function, int line, + const char *fmt, ...) { va_list args; va_start(args, fmt); - mdbx_debug_log_va(level, function, line, fmt, args); + debug_log_va(level, function, line, fmt, args); va_end(args); } @@ -3616,7 +3600,7 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, /*------------------------------------------------------------------------------ LY: debug stuff */ -static const char *mdbx_leafnode_type(MDBX_node *n) { +static const char *leafnode_type(MDBX_node *n) { static const char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; return F_ISSET(node_flags(n), F_BIGDATA) @@ -3626,7 +3610,7 @@ static const char *mdbx_leafnode_type(MDBX_node *n) { } /* Display all the keys in the page. 
*/ -MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { +MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { pgno_t pgno = mp->mp_pgno; const char *type; MDBX_node *node; @@ -3651,26 +3635,26 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { type = "Leaf2 sub-page"; break; case P_OVERFLOW: - mdbx_verbose("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); + VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); return; case P_META: - mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, - unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); + VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, + unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); return; default: - mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); + VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); return; } nkeys = page_numkeys(mp); - mdbx_verbose("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); + VERBOSE("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); for (i = 0; i < nkeys; i++) { if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ key.iov_len = nsize = mp->mp_leaf2_ksize; key.iov_base = page_leaf2key(mp, i, nsize); total += nsize; - mdbx_verbose("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); + VERBOSE("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); continue; } node = page_node(mp, i); @@ -3678,8 +3662,8 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { key.iov_base = node->mn_data; nsize = (unsigned)(NODESIZE + key.iov_len); if (IS_BRANCH(mp)) { - mdbx_verbose("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), - DKEY(&key)); + VERBOSE("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), + DKEY(&key)); total += nsize; } else { if (F_ISSET(node_flags(node), F_BIGDATA)) @@ -3688,14 +3672,14 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { nsize += (unsigned)node_ds(node); total += nsize; nsize += sizeof(indx_t); - 
mdbx_verbose("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), - mdbx_leafnode_type(node)); + VERBOSE("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), + leafnode_type(node)); } total = EVEN(total); } - mdbx_verbose("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, - page_room(mp)); + VERBOSE("Total: header %u + contents %u + unused %u\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, + page_room(mp)); } /*----------------------------------------------------------------------------*/ @@ -3726,9 +3710,9 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { /* Perform act while tracking temporary cursor mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ - mdbx_cassert(&(mn), \ - mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ - mdbx_cassert(&(mn), !cursor_is_tracked(&(mn))); \ + cASSERT(&(mn), \ + mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ + cASSERT(&(mn), !cursor_is_tracked(&(mn))); \ MDBX_cursor mc_dummy; \ MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ MDBX_cursor *tracked = &(mn); \ @@ -3747,25 +3731,25 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_cmp(a, b); } int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_dcmp(a, b); } /* Allocate memory for a page. * Re-use old malloc'ed pages first for singletons, otherwise just malloc. * Set MDBX_TXN_ERROR on failure. 
*/ -static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { +static MDBX_page *page_malloc(MDBX_txn *txn, unsigned num) { MDBX_env *env = txn->mt_env; MDBX_page *np = env->me_dp_reserve; size_t size = env->me_psize; if (likely(num == 1 && np)) { - mdbx_assert(env, env->me_dp_reserve_len > 0); + eASSERT(env, env->me_dp_reserve_len > 0); MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); VALGRIND_MEMPOOL_ALLOC(env, np, size); VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); @@ -3773,7 +3757,7 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { env->me_dp_reserve_len -= 1; } else { size = pgno2bytes(env, num); - np = mdbx_malloc(size); + np = osal_malloc(size); if (unlikely(!np)) { txn->mt_flags |= MDBX_TXN_ERROR; return np; @@ -3800,7 +3784,7 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { } /* Free a shadow dirty page */ -static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { +static void dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB)) @@ -3817,40 +3801,40 @@ static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { } else { /* large pages just get freed directly */ VALGRIND_MEMPOOL_FREE(env, dp); - mdbx_free(dp); + osal_free(dp); } } /* Return all dirty pages to dpage list */ -static void mdbx_dlist_free(MDBX_txn *txn) { +static void dlist_free(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; MDBX_dpl *const dl = txn->tw.dirtylist; for (unsigned i = 1; i <= dl->length; i++) { MDBX_page *dp = dl->items[i].ptr; - mdbx_dpage_free(env, dp, dpl_npages(dl, i)); + dpage_free(env, dp, dpl_npages(dl, i)); } dpl_clear(dl); } -static __always_inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { - mdbx_cassert(mc, (mc->mc_flags & C_SUB) != 0); +static __always_inline MDBX_db 
*outer_db(MDBX_cursor *mc) { + cASSERT(mc, (mc->mc_flags & C_SUB) != 0); MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - mdbx_cassert(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - mdbx_cassert(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + cASSERT(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + cASSERT(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); return couple->outer.mc_db; } -MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { +MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { const MDBX_dpl *const dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - mdbx_tassert(txn, txn->tw.dirtyroom + dl->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + dl->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); - if (!mdbx_audit_enabled()) + if (!AUDIT_ENABLED()) return true; unsigned loose = 0; @@ -3859,55 +3843,55 @@ MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { if (!dp) continue; - mdbx_tassert(txn, dp->mp_pgno == dl->items[i].pgno); + tASSERT(txn, dp->mp_pgno == dl->items[i].pgno); if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; - const uint32_t age = mdbx_dpl_age(txn, i); - mdbx_tassert(txn, age < UINT32_MAX / 3); + const uint32_t age = dpl_age(txn, i); + tASSERT(txn, age < UINT32_MAX / 3); if (unlikely(age > UINT32_MAX / 3)) return false; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); if (dp->mp_flags == P_LOOSE) { loose += 1; } else if (unlikely(!IS_MODIFIABLE(txn, dp))) return false; const unsigned num = dpl_npages(dl, i); - mdbx_tassert(txn, txn->mt_next_pgno >= dp->mp_pgno + num); + 
tASSERT(txn, txn->mt_next_pgno >= dp->mp_pgno + num); if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) return false; if (i < dl->sorted) { - mdbx_tassert(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); + tASSERT(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num)) return false; } - const unsigned rpa = mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, - txn->mt_next_pgno); - mdbx_tassert(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || - txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); + const unsigned rpa = + pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, txn->mt_next_pgno); + tASSERT(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || + txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) return false; if (num > 1) { - const unsigned rpb = mdbx_pnl_search( - txn->tw.reclaimed_pglist, dp->mp_pgno + num - 1, txn->mt_next_pgno); - mdbx_tassert(txn, rpa == rpb); + const unsigned rpb = pnl_search(txn->tw.reclaimed_pglist, + dp->mp_pgno + num - 1, txn->mt_next_pgno); + tASSERT(txn, rpa == rpb); if (unlikely(rpa != rpb)) return false; } } - mdbx_tassert(txn, loose == txn->tw.loose_count); + tASSERT(txn, loose == txn->tw.loose_count); if (unlikely(loose != txn->tw.loose_count)) return false; for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); - mdbx_tassert(txn, !dp); + tASSERT(txn, !dp); if (unlikely(dp)) return false; } @@ -3916,20 +3900,20 @@ MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { } #if MDBX_ENABLE_REFUND -static void mdbx_refund_reclaimed(MDBX_txn *txn) { +static void refund_reclaimed(MDBX_txn *txn) { /* Scanning in descend order */ pgno_t next_pgno = txn->mt_next_pgno; const MDBX_PNL pnl = txn->tw.reclaimed_pglist; - mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) && 
MDBX_PNL_MOST(pnl) == next_pgno - 1); + tASSERT(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); #if MDBX_PNL_ASCENDING unsigned i = MDBX_PNL_SIZE(pnl); - mdbx_tassert(txn, pnl[i] == next_pgno - 1); + tASSERT(txn, pnl[i] == next_pgno - 1); while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) ; MDBX_PNL_SIZE(pnl) = i; #else unsigned i = 1; - mdbx_tassert(txn, pnl[i] == next_pgno - 1); + tASSERT(txn, pnl[i] == next_pgno - 1); unsigned len = MDBX_PNL_SIZE(pnl); while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) ; @@ -3937,42 +3921,42 @@ static void mdbx_refund_reclaimed(MDBX_txn *txn) { for (unsigned move = 0; move < len; ++move) pnl[1 + move] = pnl[i + move]; #endif - mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, - txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); + VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, + txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); txn->mt_next_pgno = next_pgno; - mdbx_tassert(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - 1)); + tASSERT(txn, + pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - 1)); } -static void mdbx_refund_loose(MDBX_txn *txn) { - mdbx_tassert(txn, txn->tw.loose_pages != nullptr); - mdbx_tassert(txn, txn->tw.loose_count > 0); +static void refund_loose(MDBX_txn *txn) { + tASSERT(txn, txn->tw.loose_pages != nullptr); + tASSERT(txn, txn->tw.loose_count > 0); MDBX_dpl *const dl = txn->tw.dirtylist; - mdbx_tassert(txn, dl->length >= txn->tw.loose_count); + tASSERT(txn, dl->length >= txn->tw.loose_count); pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; MDBX_PNL suitable = onstack; if (dl->length - dl->sorted > txn->tw.loose_count) { /* Dirty list is useless since unsorted. 
*/ - if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) { - suitable = mdbx_pnl_alloc(txn->tw.loose_count); + if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) { + suitable = pnl_alloc(txn->tw.loose_count); if (unlikely(!suitable)) return /* this is not a reason for transaction fail */; } /* Collect loose-pages which may be refunded. */ - mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); + tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); pgno_t most = MIN_PAGENO; unsigned w = 0; for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { - mdbx_tassert(txn, lp->mp_flags == P_LOOSE); - mdbx_tassert(txn, txn->mt_next_pgno > lp->mp_pgno); + tASSERT(txn, lp->mp_flags == P_LOOSE); + tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { - mdbx_tassert(txn, - w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack)) - : MDBX_PNL_ALLOCLEN(suitable))); + tASSERT(txn, + w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) + : MDBX_PNL_ALLOCLEN(suitable))); suitable[++w] = lp->mp_pgno; most = (lp->mp_pgno > most) ? lp->mp_pgno : most; } @@ -3981,14 +3965,14 @@ static void mdbx_refund_loose(MDBX_txn *txn) { if (most + 1 == txn->mt_next_pgno) { /* Sort suitable list and refund pages at the tail. */ MDBX_PNL_SIZE(suitable) = w; - mdbx_pnl_sort(suitable, MAX_PAGENO + 1); + pnl_sort(suitable, MAX_PAGENO + 1); /* Scanning in descend order */ const int step = MDBX_PNL_ASCENDING ? -1 : 1; const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1; const int end = MDBX_PNL_ASCENDING ? 
0 : MDBX_PNL_SIZE(suitable) + 1; - mdbx_tassert(txn, suitable[begin] >= suitable[end - step]); - mdbx_tassert(txn, most == suitable[begin]); + tASSERT(txn, suitable[begin] >= suitable[end - step]); + tASSERT(txn, most == suitable[begin]); for (int i = begin + step; i != end; i += step) { if (suitable[i] != most - 1) @@ -3996,8 +3980,8 @@ static void mdbx_refund_loose(MDBX_txn *txn) { most -= 1; } const unsigned refunded = txn->mt_next_pgno - most; - mdbx_debug("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, - refunded, most, txn->mt_next_pgno); + DEBUG("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, + most, txn->mt_next_pgno); txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); @@ -4022,8 +4006,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { } } dpl_setlen(dl, w); - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); @@ -4032,19 +4015,19 @@ static void mdbx_refund_loose(MDBX_txn *txn) { } else { /* Dirtylist is mostly sorted, just refund loose pages at the end. */ dpl_sort(txn); - mdbx_tassert(txn, dl->length < 2 || - dl->items[1].pgno < dl->items[dl->length].pgno); - mdbx_tassert(txn, dl->sorted == dl->length); + tASSERT(txn, + dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno); + tASSERT(txn, dl->sorted == dl->length); /* Scan dirtylist tail-forward and cutoff suitable pages. 
*/ unsigned n; for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 && dl->items[n].ptr->mp_flags == P_LOOSE; --n) { - mdbx_tassert(txn, n > 0); + tASSERT(txn, n > 0); MDBX_page *dp = dl->items[n].ptr; - mdbx_debug("refund-sorted page %" PRIaPGNO, dp->mp_pgno); - mdbx_tassert(txn, dp->mp_pgno == dl->items[n].pgno); + DEBUG("refund-sorted page %" PRIaPGNO, dp->mp_pgno); + tASSERT(txn, dp->mp_pgno == dl->items[n].pgno); txn->mt_next_pgno -= 1; } dpl_setlen(dl, n); @@ -4054,8 +4037,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { dl->sorted = dl->length; txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); @@ -4063,41 +4045,41 @@ static void mdbx_refund_loose(MDBX_txn *txn) { unlink_loose: for (MDBX_page **link = &txn->tw.loose_pages; *link;) { MDBX_page *dp = *link; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE); + tASSERT(txn, dp->mp_flags == P_LOOSE); if (txn->mt_next_pgno > dp->mp_pgno) { link = &dp->mp_next; } else { *link = dp->mp_next; if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(txn->mt_env, dp, 1); + dpage_free(txn->mt_env, dp, 1); } } } } - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (suitable != onstack) - mdbx_pnl_free(suitable); + pnl_free(suitable); txn->tw.loose_refund_wl = txn->mt_next_pgno; } -static bool mdbx_refund(MDBX_txn *txn) { +static bool txn_refund(MDBX_txn *txn) { const pgno_t before = txn->mt_next_pgno; if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno) - mdbx_refund_loose(txn); + refund_loose(txn); while (true) { if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 || MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) break; - mdbx_refund_reclaimed(txn); + refund_reclaimed(txn); if (!txn->tw.loose_pages || 
txn->tw.loose_refund_wl <= txn->mt_next_pgno) break; const pgno_t memo = txn->mt_next_pgno; - mdbx_refund_loose(txn); + refund_loose(txn); if (memo == txn->mt_next_pgno) break; } @@ -4107,29 +4089,29 @@ static bool mdbx_refund(MDBX_txn *txn) { if (txn->tw.spill_pages) /* Squash deleted pagenums if we refunded any */ - mdbx_spill_purge(txn); + spill_purge(txn); return true; } #else /* MDBX_ENABLE_REFUND */ -static __inline bool mdbx_refund(MDBX_txn *txn) { +static __inline bool txn_refund(MDBX_txn *txn) { (void)txn; /* No online auto-compactification. */ return false; } #endif /* MDBX_ENABLE_REFUND */ -__cold static void mdbx_kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, - unsigned npages) { +__cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, + unsigned npages) { MDBX_env *const env = txn->mt_env; - mdbx_debug("kill %u page(s) %" PRIaPGNO, npages, pgno); - mdbx_assert(env, pgno >= NUM_METAS && npages); + DEBUG("kill %u page(s) %" PRIaPGNO, npages, pgno); + eASSERT(env, pgno >= NUM_METAS && npages); if (!IS_FROZEN(txn, mp)) { const size_t bytes = pgno2bytes(env, npages); memset(mp, -1, bytes); mp->mp_pgno = pgno; if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); + osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); } else { struct iovec iov[MDBX_COMMIT_PAGES]; iov[0].iov_len = env->me_psize; @@ -4139,27 +4121,26 @@ __cold static void mdbx_kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, while (--npages) { iov[n] = iov[0]; if (++n == MDBX_COMMIT_PAGES) { - mdbx_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, + osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, pgno2bytes(env, MDBX_COMMIT_PAGES)); iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); n = 0; } } - mdbx_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); + osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); } } /* Remove page from dirty list */ -static __inline 
void mdbx_page_wash(MDBX_txn *txn, const unsigned di, - MDBX_page *const mp, - const unsigned npages) { - mdbx_tassert(txn, di && di <= txn->tw.dirtylist->length && - txn->tw.dirtylist->items[di].ptr == mp); - mdbx_dpl_remove(txn, di); +static __inline void page_wash(MDBX_txn *txn, const unsigned di, + MDBX_page *const mp, const unsigned npages) { + tASSERT(txn, di && di <= txn->tw.dirtylist->length && + txn->tw.dirtylist->items[di].ptr == mp); + dpl_remove(txn, di); txn->tw.dirtyroom++; - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_BAD; VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); @@ -4169,7 +4150,7 @@ static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); } else - mdbx_dpage_free(txn->mt_env, mp, npages); + dpage_free(txn->mt_env, mp, npages); } /* Retire, loosen or free a single page. @@ -4180,12 +4161,12 @@ static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, * * If the page wasn't dirtied in this txn, just add it * to this txn's free list. */ -static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, - MDBX_page *mp /* maybe null */, - unsigned pageflags /* maybe unknown/zero */) { +static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, + MDBX_page *mp /* maybe null */, + unsigned pageflags /* maybe unknown/zero */) { int rc; MDBX_txn *const txn = mc->mc_txn; - mdbx_tassert(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags)); + tASSERT(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags)); /* During deleting entire subtrees, it is reasonable and possible to avoid * reading leaf pages, i.e. 
significantly reduce hard page-faults & IOPs: @@ -4201,41 +4182,41 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, unsigned di = 0, si = 0, npages = 1; bool is_frozen = false, is_spilled = false, is_shadowed = false; if (unlikely(!mp)) { - if (mdbx_assert_enabled() && pageflags) { + if (ASSERT_ENABLED() && pageflags) { pgr_t check; check = page_get_any(mc, pgno, txn->mt_front); if (unlikely(check.err != MDBX_SUCCESS)) return check.err; - mdbx_tassert(txn, (check.page->mp_flags & ~P_LEAF2) == - (pageflags & ~P_FROZEN)); - mdbx_tassert(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); + tASSERT(txn, + (check.page->mp_flags & ~P_LEAF2) == (pageflags & ~P_FROZEN)); + tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } if (pageflags & P_FROZEN) { is_frozen = true; - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { - mdbx_tassert(txn, !mdbx_search_spilled(scan, pgno)); - mdbx_tassert(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); + tASSERT(txn, !search_spilled(scan, pgno)); + tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); } } goto status_done; } else if (pageflags && txn->tw.dirtylist) { - if ((di = mdbx_dpl_exist(txn, pgno)) != 0) { + if ((di = dpl_exist(txn, pgno)) != 0) { mp = txn->tw.dirtylist->items[di].ptr; - mdbx_tassert(txn, IS_MODIFIABLE(txn, mp)); + tASSERT(txn, IS_MODIFIABLE(txn, mp)); goto status_done; } - if ((si = mdbx_search_spilled(txn, pgno)) != 0) { + if ((si = search_spilled(txn, pgno)) != 0) { is_spilled = true; goto status_done; } for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_dpl_exist(parent, pgno)) { + if (dpl_exist(parent, pgno)) { is_shadowed = true; goto status_done; } - if (mdbx_search_spilled(parent, pgno)) { + if (search_spilled(parent, pgno)) { is_spilled = true; goto status_done; } @@ -4248,7 +4229,7 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, 
const pgno_t pgno, if (unlikely(pg.err != MDBX_SUCCESS)) return pg.err; mp = pg.page; - mdbx_tassert(txn, !pageflags || mp->mp_flags == pageflags); + tASSERT(txn, !pageflags || mp->mp_flags == pageflags); pageflags = mp->mp_flags; } @@ -4258,21 +4239,21 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); is_shadowed = IS_SHADOWED(txn, mp); if (is_dirty) { - mdbx_tassert(txn, !is_spilled); - mdbx_tassert(txn, !mdbx_search_spilled(txn, pgno)); - mdbx_tassert(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || - (txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !is_spilled); + tASSERT(txn, !search_spilled(txn, pgno)); + tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || + (txn->mt_flags & MDBX_WRITEMAP)); } else { - mdbx_tassert(txn, !debug_dpl_find(txn, pgno)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); } - di = is_dirty ? mdbx_dpl_exist(txn, pgno) : 0; - si = is_spilled ? mdbx_search_spilled(txn, pgno) : 0; - mdbx_tassert(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); + di = is_dirty ? dpl_exist(txn, pgno) : 0; + si = is_spilled ? 
search_spilled(txn, pgno) : 0; + tASSERT(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); } else { - mdbx_tassert(txn, !IS_MODIFIABLE(txn, mp)); - mdbx_tassert(txn, !IS_SPILLED(txn, mp)); - mdbx_tassert(txn, !IS_SHADOWED(txn, mp)); + tASSERT(txn, !IS_MODIFIABLE(txn, mp)); + tASSERT(txn, !IS_SPILLED(txn, mp)); + tASSERT(txn, !IS_SHADOWED(txn, mp)); } status_done: @@ -4280,27 +4261,27 @@ status_done: STATIC_ASSERT(P_BRANCH == 1); const bool is_branch = pageflags & P_BRANCH; if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - mdbx_cassert(mc, !is_branch || outer->md_branch_pages > 0); + MDBX_db *outer = outer_db(mc); + cASSERT(mc, !is_branch || outer->md_branch_pages > 0); outer->md_branch_pages -= is_branch; - mdbx_cassert(mc, is_branch || outer->md_leaf_pages > 0); + cASSERT(mc, is_branch || outer->md_leaf_pages > 0); outer->md_leaf_pages -= 1 - is_branch; } - mdbx_cassert(mc, !is_branch || mc->mc_db->md_branch_pages > 0); + cASSERT(mc, !is_branch || mc->mc_db->md_branch_pages > 0); mc->mc_db->md_branch_pages -= is_branch; - mdbx_cassert(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); + cASSERT(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); mc->mc_db->md_leaf_pages -= (pageflags & P_LEAF) != 0; } else { npages = mp->mp_pages; - mdbx_cassert(mc, mc->mc_db->md_overflow_pages >= npages); + cASSERT(mc, mc->mc_db->md_overflow_pages >= npages); mc->mc_db->md_overflow_pages -= npages; } if (is_frozen) { retire: - mdbx_debug("retire %u page %" PRIaPGNO, npages, pgno); - rc = mdbx_pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + DEBUG("retire %u page %" PRIaPGNO, npages, pgno); + rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); + tASSERT(txn, dirtylist_check(txn)); return rc; } @@ -4315,44 +4296,43 @@ status_done: * Её МОЖНО вытолкнуть в нераспределенный хвост. 
*/ kind = "dirty"; /* Remove from dirty list */ - mdbx_page_wash(txn, di, mp, npages); + page_wash(txn, di, mp, npages); } else if (si) { /* Страница пролита в этой транзакции, т.е. она аллоцирована * и запачкана в этой или одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "spilled"; - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); } else if ((txn->mt_flags & MDBX_WRITEMAP)) { kind = "writemap"; - mdbx_tassert(txn, mp && IS_MODIFIABLE(txn, mp)); + tASSERT(txn, mp && IS_MODIFIABLE(txn, mp)); } else { /* Страница аллоцирована, запачкана и возможно пролита в одной * из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "parent's"; - if (mdbx_assert_enabled() && mp) { + if (ASSERT_ENABLED() && mp) { kind = nullptr; for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_search_spilled(parent, pgno)) { + if (search_spilled(parent, pgno)) { kind = "parent-spilled"; - mdbx_tassert(txn, is_spilled); + tASSERT(txn, is_spilled); break; } if (mp == debug_dpl_find(parent, pgno)) { kind = "parent-dirty"; - mdbx_tassert(txn, !is_spilled); + tASSERT(txn, !is_spilled); break; } } - mdbx_tassert(txn, kind != nullptr); + tASSERT(txn, kind != nullptr); } - mdbx_tassert(txn, - is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); + tASSERT(txn, is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); } - mdbx_debug("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); + DEBUG("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); txn->mt_next_pgno = pgno; - mdbx_refund(txn); + txn_refund(txn); return MDBX_SUCCESS; } @@ -4366,7 +4346,7 @@ status_done: txn->mt_next_pgno > pgno + txn->mt_env->me_options.dp_loose_limit || txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { - mdbx_debug("loosen dirty page %" PRIaPGNO, pgno); + DEBUG("loosen dirty page %" PRIaPGNO, pgno); mp->mp_flags = P_LOOSE; mp->mp_next = txn->tw.loose_pages; 
txn->tw.loose_pages = mp; @@ -4396,16 +4376,16 @@ status_done: for (MDBX_txn *parent = txn->mt_parent; parent && (parent->mt_flags & MDBX_TXN_SPILLS); parent = parent->mt_parent) { - if (mdbx_intersect_spilled(parent, pgno, npages)) + if (intersect_spilled(parent, pgno, npages)) goto skip_invalidate; - if (mdbx_dpl_intersect(parent, pgno, npages)) + if (dpl_intersect(parent, pgno, npages)) goto skip_invalidate; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif - mdbx_kill_page(txn, mp, pgno, npages); + kill_page(txn, mp, pgno, npages); if (!(txn->mt_flags & MDBX_WRITEMAP)) { VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); @@ -4416,21 +4396,20 @@ status_done: } skip_invalidate: /* Remove from dirty list */ - mdbx_page_wash(txn, di, mp, npages); + page_wash(txn, di, mp, npages); reclaim: - mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); - rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); + rc = pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); return rc; } if (si) { /* Page ws spilled in this txn */ - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); /* Страница могла быть выделена и затем пролита в этой транзакции, * тогда её необходимо поместить в reclaimed-список. * Либо она могла быть выделена в одной из родительских транзакций и затем @@ -4438,7 +4417,7 @@ status_done: * retired-список для последующей фильтрации при коммите. 
*/ for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_dpl_exist(parent, pgno)) + if (dpl_exist(parent, pgno)) goto retire; } /* Страница точно была выделена в этой транзакции @@ -4448,15 +4427,15 @@ status_done: if (is_shadowed) { /* Dirty page MUST BE a clone from (one of) parent transaction(s). */ - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const MDBX_page *parent_dp = nullptr; /* Check parent(s)'s dirty lists. */ for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp; parent = parent->mt_parent) { - mdbx_tassert(txn, !mdbx_search_spilled(parent, pgno)); + tASSERT(txn, !search_spilled(parent, pgno)); parent_dp = debug_dpl_find(parent, pgno); } - mdbx_tassert(txn, parent_dp && (!mp || parent_dp == mp)); + tASSERT(txn, parent_dp && (!mp || parent_dp == mp)); } /* Страница была выделена в родительской транзакции и теперь может быть * использована повторно, но только внутри этой транзакции, либо дочерних. @@ -4473,11 +4452,11 @@ status_done: goto retire; } -static __inline int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { - return mdbx_page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); +static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { + return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); } -struct mdbx_iov_ctx { +struct iov_ctx { unsigned iov_items; size_t iov_bytes; size_t iov_off; @@ -4486,8 +4465,7 @@ struct mdbx_iov_ctx { struct iovec iov[MDBX_COMMIT_PAGES]; }; -static __inline void mdbx_iov_init(MDBX_txn *const txn, - struct mdbx_iov_ctx *ctx) { +static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) { ctx->flush_begin = MAX_PAGENO; ctx->flush_end = MIN_PAGENO; ctx->iov_items = 0; @@ -4496,39 +4474,37 @@ static __inline void mdbx_iov_init(MDBX_txn *const txn, (void)txn; } -static __inline void mdbx_iov_done(MDBX_txn *const txn, - struct mdbx_iov_ctx *ctx) { - mdbx_tassert(txn, ctx->iov_items == 0); +static __inline void iov_done(MDBX_txn *const txn, struct 
iov_ctx *ctx) { + tASSERT(txn, ctx->iov_items == 0); #if defined(__linux__) || defined(__gnu_linux__) MDBX_env *const env = txn->mt_env; - if (!(txn->mt_flags & MDBX_WRITEMAP) && - mdbx_linux_kernel_version < 0x02060b00) + if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00) /* Linux kernels older than version 2.6.11 ignore the addr and nbytes * arguments, making this function fairly expensive. Therefore, the * whole cache is always flushed. */ - mdbx_flush_incoherent_mmap( + osal_flush_incoherent_mmap( env->me_map + pgno2bytes(env, ctx->flush_begin), pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize); #endif /* Linux */ } -static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - mdbx_tassert(txn, ctx->iov_items > 0); +static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) { + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, ctx->iov_items > 0); MDBX_env *const env = txn->mt_env; int rc; if (likely(ctx->iov_items == 1)) { - mdbx_assert(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); - rc = mdbx_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, + eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); + rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, ctx->iov_off); } else { - rc = mdbx_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, + rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, ctx->iov_bytes); } if (unlikely(rc != MDBX_SUCCESS)) - mdbx_error("Write error: %s", mdbx_strerror(rc)); + ERROR("Write error: %s", mdbx_strerror(rc)); else { VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off, ctx->iov_bytes); @@ -4552,15 +4528,14 @@ static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { while (likely(rc == MDBX_SUCCESS) && unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) { if (!timestamp) { - timestamp = 
mdbx_osal_monotime(); - mdbx_iov_done(txn, ctx); - mdbx_warning( + timestamp = osal_monotime(); + iov_done(txn, ctx); + WARNING( "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); - } else if (unlikely(mdbx_osal_monotime() - timestamp > 65536 / 10)) { - mdbx_error( - "bailout waiting for %" PRIaPGNO " page arrival %s", wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); + } else if (unlikely(osal_monotime() - timestamp > 65536 / 10)) { + ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", wp->mp_pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); rc = MDBX_CORRUPTED; } #if defined(_WIN32) || defined(_WIN64) @@ -4573,19 +4548,17 @@ static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { usleep(42); #endif } - mdbx_dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); + dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); } return rc; } -static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, +static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, unsigned npages) { MDBX_env *const env = txn->mt_env; - mdbx_tassert(txn, - dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); - mdbx_tassert(txn, IS_MODIFIABLE(txn, dp)); - mdbx_tassert(txn, - !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); + tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, IS_MODIFIABLE(txn, dp)); + tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); ctx->flush_begin = (ctx->flush_begin < dp->mp_pgno) ? 
ctx->flush_begin : dp->mp_pgno; @@ -4595,24 +4568,24 @@ static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, env->me_lck->mti_unsynced_pages.weak += npages; if (IS_SHADOWED(txn, dp)) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); dp->mp_txnid = txn->mt_txnid; - mdbx_tassert(txn, IS_SPILLED(txn, dp)); + tASSERT(txn, IS_SPILLED(txn, dp)); const size_t size = pgno2bytes(env, npages); if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) || ctx->iov_items == ARRAY_LENGTH(ctx->iov) || ctx->iov_bytes + size > MAX_WRITE) { if (ctx->iov_items) { - int err = mdbx_iov_write(txn, ctx); + int err = iov_write(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; #if defined(__linux__) || defined(__gnu_linux__) - if (mdbx_linux_kernel_version >= 0x02060b00) + if (linux_kernel_version >= 0x02060b00) /* Linux kernels older than version 2.6.11 ignore the addr and nbytes * arguments, making this function fairly expensive. Therefore, the * whole cache is always flushed. 
*/ #endif /* Linux */ - mdbx_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, + osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, env->me_os_psize); } ctx->iov_off = pgno2bytes(env, dp->mp_pgno); @@ -4622,18 +4595,18 @@ static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, ctx->iov_items += 1; ctx->iov_bytes += size; } else { - mdbx_tassert(txn, txn->mt_flags & MDBX_WRITEMAP); + tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); } return MDBX_SUCCESS; } -static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, +static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, unsigned npages) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); pgno_t pgno = dp->mp_pgno; int err = iov_page(txn, ctx, dp, npages); if (likely(err == MDBX_SUCCESS)) { - err = mdbx_pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); + err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); #if MDBX_ENABLE_PGOP_STAT if (likely(err == MDBX_SUCCESS)) txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; @@ -4644,7 +4617,7 @@ static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. 
*/ -static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { +static unsigned cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { unsigned keep = 0; while (mc->mc_flags & C_INITIALIZED) { for (unsigned i = 0; i < mc->mc_snum; ++i) { @@ -4652,7 +4625,7 @@ static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { unsigned const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - mdbx_dpl_age(txn, n)) { + dpl_age(txn, n)) { txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; ++keep; } @@ -4665,14 +4638,14 @@ static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { return keep; } -static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - unsigned keep = m0 ? mdbx_cursor_keep(txn, m0) : 0; +static unsigned txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { + unsigned keep = m0 ? cursor_keep(txn, m0) : 0; for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && txn->mt_dbs[i].md_root != P_INVALID) for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next) if (mc != m0) - keep += mdbx_cursor_keep(txn, mc); + keep += cursor_keep(txn, mc); return keep; } @@ -4683,21 +4656,21 @@ static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; - const uint32_t age = mdbx_dpl_age(txn, i); + const uint32_t age = dpl_age(txn, i); const unsigned npages = dpl_npages(dl, i); const pgno_t pgno = dl->items[i].pgno; if (age == 0) { - mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); + DEBUG("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); return 256; } MDBX_page *const dp = dl->items[i].ptr; if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { - mdbx_debug("skip %s %u page %" PRIaPGNO, - (dp->mp_flags & P_LOOSE) ? "loose" - : (dp->mp_flags & P_LOOSE) ? 
"loose" - : "parent-spilled", - npages, pgno); + DEBUG("skip %s %u page %" PRIaPGNO, + (dp->mp_flags & P_LOOSE) ? "loose" + : (dp->mp_flags & P_LOOSE) ? "loose" + : "parent-spilled", + npages, pgno); return 256; } @@ -4706,17 +4679,17 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, MDBX_txn *parent = txn->mt_parent; if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { do - if (mdbx_intersect_spilled(parent, pgno, npages)) { - mdbx_debug("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); + if (intersect_spilled(parent, pgno, npages)) { + DEBUG("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); dp->mp_flags |= P_SPILLED; return 256; } while ((parent = parent->mt_parent) != nullptr); } - mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX); + tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX); unsigned prio = age * reciprocal >> 24; - mdbx_tassert(txn, prio < 256); + tASSERT(txn, prio < 256); if (likely(npages == 1)) return prio = 256 - prio; @@ -4728,7 +4701,7 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, factor |= factor >> 16; factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; factor = (factor < 256) ? 255 - factor : 0; - mdbx_tassert(txn, factor < 256 && factor < (256 - prio)); + tASSERT(txn, factor < 256 && factor < (256 - prio)); return prio = factor; } @@ -4750,8 +4723,8 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, * If the txn never references them again, they can be left alone. * If the txn only reads them, they can be used without any fuss. * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of mdbx_page_touch(). Such references are - * handled by mdbx_page_unspill(). + * going thru all of the work of page_touch(). Such references are + * handled by page_unspill(). * * Also note, we never spill DB root pages, nor pages of active cursors, * because we'll need these back again soon anyway. 
And in nested txns, @@ -4759,8 +4732,8 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, * parent txn. That would alter the parent txns' data even though * the child hasn't committed yet, and we'd have no way to undo it if * the child aborted. */ -static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, - const unsigned need) { +static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, + const unsigned need) { #if xMDBX_DEBUG_SPILLING != 1 /* production mode */ if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) @@ -4786,12 +4759,12 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (!wanna_spill) return MDBX_SUCCESS; - mdbx_notice("spilling %u dirty-entries (have %u dirty-room, need %u)", - wanna_spill, txn->tw.dirtyroom, need); - mdbx_tassert(txn, txn->tw.dirtylist->length >= wanna_spill); + NOTICE("spilling %u dirty-entries (have %u dirty-room, need %u)", wanna_spill, + txn->tw.dirtyroom, need); + tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); - struct mdbx_iov_ctx ctx; - mdbx_iov_init(txn, &ctx); + struct iov_ctx ctx; + iov_init(txn, &ctx); int rc = MDBX_SUCCESS; if (txn->mt_flags & MDBX_WRITEMAP) { MDBX_dpl *const dl = txn->tw.dirtylist; @@ -4804,21 +4777,21 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, dl->items[++w] = dl->items[r]; else if (!MDBX_FAKE_SPILL_WRITEMAP) { rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r)); - mdbx_tassert(txn, rc == MDBX_SUCCESS); + tASSERT(txn, rc == MDBX_SUCCESS); } } - mdbx_tassert(txn, span == r - 1 - w && w == txn->tw.loose_count); + tASSERT(txn, span == r - 1 - w && w == txn->tw.loose_count); dl->sorted = (dl->sorted == dl->length) ? 
w : 0; dpl_setlen(dl, w); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) { MDBX_env *const env = txn->mt_env; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_msync(&env->me_dxb_mmap, + rc = osal_msync(&env->me_dxb_mmap, pgno_align2os_bytes(env, ctx.flush_begin), pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin), MDBX_SYNC_NONE); @@ -4826,10 +4799,10 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, return rc; } - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); if (!txn->tw.spill_pages) { txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = mdbx_pnl_alloc(wanna_spill); + txn->tw.spill_pages = pnl_alloc(wanna_spill); if (unlikely(!txn->tw.spill_pages)) { rc = MDBX_ENOMEM; bailout: @@ -4838,10 +4811,10 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } } else { /* purge deleted slots */ - mdbx_spill_purge(txn); - rc = mdbx_pnl_reserve(&txn->tw.spill_pages, wanna_spill); + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill); (void)rc /* ignore since the resulting list may be shorter - and mdbx_pnl_append() will increase pnl on demand */ + and pnl_append() will increase pnl on demand */ ; } @@ -4849,16 +4822,16 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, MDBX_dpl *const dl = dpl_sort(txn); /* Preserve pages which may soon be dirtied again */ - const unsigned unspillable = mdbx_txn_keep(txn, m0); + const unsigned unspillable = txn_keep(txn, m0); if (unspillable + txn->tw.loose_count >= dl->length) { #if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */ if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) return MDBX_SUCCESS; #endif /* xMDBX_DEBUG_SPILLING */ - mdbx_error("all %u dirty pages are unspillable since 
referenced " - "by a cursor(s), use fewer cursors or increase " - "MDBX_opt_txn_dp_limit", - unspillable); + ERROR("all %u dirty pages are unspillable since referenced " + "by a cursor(s), use fewer cursors or increase " + "MDBX_opt_txn_dp_limit", + unspillable); goto done; } @@ -4888,11 +4861,11 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, /* get min/max of LRU-labels */ uint32_t age_max = 0; for (unsigned i = 1; i <= dl->length; ++i) { - const uint32_t age = mdbx_dpl_age(txn, i); + const uint32_t age = dpl_age(txn, i); age_max = (age_max >= age) ? age_max : age; } - mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); + VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); /* half of 8-bit radix-sort */ unsigned radix_counters[256], spillable = 0, spilled = 0; @@ -4921,10 +4894,10 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, break; } - mdbx_verbose("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, " - "wanna_spill %u", - prio2spill, prio2adjacent, amount, spillable, wanna_spill); - mdbx_tassert(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); + VERBOSE("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, " + "wanna_spill %u", + prio2spill, prio2adjacent, amount, spillable, wanna_spill); + tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); unsigned prev_prio = 256; unsigned r, w, prio; @@ -4938,10 +4911,10 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (prio <= prio2spill) { if (prev_prio < prio2adjacent && prev_prio > prio2spill && dpl_endpgno(dl, r - 1) == pgno) { - mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - dpl_npages(dl, w), dl->items[r - 1].pgno, - mdbx_dpl_age(txn, r - 1), prev_prio); + DEBUG("co-spill %u prev-adjacent page %" PRIaPGNO + " (age %d, prio %u)", + dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1), + prev_prio); --w; rc = spill_page(txn, &ctx, 
dl->items[r - 1].ptr, dpl_npages(dl, r - 1)); @@ -4950,8 +4923,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, ++spilled; } - mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, - dp->mp_pgno, mdbx_dpl_age(txn, r), prio); + DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, + dp->mp_pgno, dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -4960,9 +4933,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { - mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio); + DEBUG("co-spill %u next-adjacent page %" PRIaPGNO + " (age %d, prio %u)", + npages, dp->mp_pgno, dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -4974,50 +4947,48 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, dl->items[++w] = dl->items[r]; } - mdbx_tassert(txn, spillable == 0 || spilled > 0); + tASSERT(txn, spillable == 0 || spilled > 0); while (r <= dl->length) dl->items[++w] = dl->items[r++]; - mdbx_tassert(txn, r - 1 - w == spilled); + tASSERT(txn, r - 1 - w == spilled); dl->sorted = dpl_setlen(dl, w); txn->tw.dirtyroom += spilled; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (ctx.iov_items) - rc = mdbx_iov_write(txn, &ctx); + rc = iov_write(txn, &ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - mdbx_pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); + pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; - mdbx_notice("spilled %u dirty-entries, now have %u dirty-room", spilled, - txn->tw.dirtyroom); - mdbx_iov_done(txn, &ctx); + NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled, + txn->tw.dirtyroom); + iov_done(txn, &ctx); } 
else { - mdbx_tassert(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); + tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); for (unsigned i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; - mdbx_notice( - "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, - dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i), - spill_prio(txn, i, reciprocal)); + NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", + i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), + spill_prio(txn, i, reciprocal)); } } #if xMDBX_DEBUG_SPILLING == 2 if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1) - mdbx_error("dirty-list length: before %u, after %u, parent %i, loose %u; " - "needed %u, spillable %u; " - "spilled %u dirty-entries, now have %u dirty-room", - dl->length + spilled, dl->length, - (txn->mt_parent && txn->mt_parent->tw.dirtylist) - ? (int)txn->mt_parent->tw.dirtylist->length - : -1, - txn->tw.loose_count, need, spillable, spilled, - txn->tw.dirtyroom); - mdbx_ensure(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); + ERROR("dirty-list length: before %u, after %u, parent %i, loose %u; " + "needed %u, spillable %u; " + "spilled %u dirty-entries, now have %u dirty-room", + dl->length + spilled, dl->length, + (txn->mt_parent && txn->mt_parent->tw.dirtylist) + ? 
(int)txn->mt_parent->tw.dirtylist->length + : -1, + txn->tw.loose_count, need, spillable, spilled, txn->tw.dirtyroom); + ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); #endif /* xMDBX_DEBUG_SPILLING */ done: @@ -5027,8 +4998,8 @@ done: : MDBX_TXN_FULL; } -static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key, - const MDBX_val *data) { +static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, + const MDBX_val *data) { MDBX_txn *txn = mc->mc_txn; /* Estimate how much space this operation will take: */ /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ @@ -5055,7 +5026,7 @@ static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key, mc->mc_txn->mt_env->debug_dirtied_act = 0; #endif /* xMDBX_DEBUG_SPILLING == 2 */ - return mdbx_txn_spill(txn, mc, need); + return txn_spill(txn, mc, need); } /*----------------------------------------------------------------------------*/ @@ -5080,7 +5051,7 @@ MDBX_NOTHROW_PURE_FUNCTION static __inline txnid_t constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) { txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); - mdbx_assert(env, a == b); + eASSERT(env, a == b); (void)env; return (a == b) ? 
a : 0; } @@ -5104,31 +5075,31 @@ static __inline txnid_t meta_txnid(const MDBX_env *env, static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && - unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && + unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; unaligned_poke_u64(4, meta->mm_txnid_b, 0); - mdbx_memory_fence(mo_AcquireRelease, true); + osal_memory_fence(mo_AcquireRelease, true); unaligned_poke_u64(4, meta->mm_txnid_a, txnid); } static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; - mdbx_jitter4testing(true); + jitter4testing(true); memcpy(&meta->mm_bootid, &bootid, 16); unaligned_poke_u64(4, meta->mm_txnid_b, txnid); - mdbx_memory_fence(mo_AcquireRelease, true); + osal_memory_fence(mo_AcquireRelease, true); } static __inline void meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, const txnid_t txnid) { - mdbx_assert(env, !env->me_map || meta < METAPAGE(env, 0) || - meta >= METAPAGE_END(env)); + eASSERT(env, + !env->me_map || meta < METAPAGE(env, 0) || meta >= METAPAGE_END(env)); (void)env; /* update inconsistently since this function used ONLY for filling meta-image * for writing, but not the actual meta-page */ @@ -5153,20 +5124,20 @@ enum meta_choice_mode { prefer_last, 
prefer_steady }; static __inline bool meta_ot(const enum meta_choice_mode mode, const MDBX_env *env, volatile const MDBX_meta *a, volatile const MDBX_meta *b) { - mdbx_jitter4testing(true); + jitter4testing(true); const txnid_t txnid_a = meta_txnid(env, a); - mdbx_jitter4testing(true); + jitter4testing(true); const txnid_t txnid_b = meta_txnid(env, b); - mdbx_jitter4testing(true); + jitter4testing(true); const bool is_stead_b = META_IS_STEADY(b); if (mode == prefer_steady) { - mdbx_jitter4testing(true); + jitter4testing(true); const bool is_stead_a = META_IS_STEADY(a); if (is_stead_a != is_stead_b) return is_stead_b; } else { - mdbx_assert(env, mode == prefer_last); + eASSERT(env, mode == prefer_last); } if (txnid_a == txnid_b) return is_stead_b; @@ -5175,16 +5146,16 @@ static __inline bool meta_ot(const enum meta_choice_mode mode, static bool meta_eq(const MDBX_env *env, volatile const MDBX_meta *a, volatile const MDBX_meta *b) { - mdbx_jitter4testing(true); + jitter4testing(true); const txnid_t txnid = meta_txnid(env, a); if (!txnid || txnid != meta_txnid(env, b)) return false; - mdbx_jitter4testing(true); + jitter4testing(true); if (META_IS_STEADY(a) != META_IS_STEADY(b)) return false; - mdbx_jitter4testing(true); + jitter4testing(true); return true; } @@ -5205,7 +5176,7 @@ static __always_inline volatile const MDBX_meta * meta_recent(const enum meta_choice_mode mode, const MDBX_env *env, volatile const MDBX_meta *a, volatile const MDBX_meta *b) { const bool a_older_that_b = meta_ot(mode, env, a, b); - mdbx_assert(env, !meta_eq(env, a, b)); + eASSERT(env, !meta_eq(env, a, b)); return a_older_that_b ? b : a; } @@ -5213,7 +5184,7 @@ static const MDBX_meta *meta_ancient_prefer_weak(const MDBX_env *env, const MDBX_meta *a, const MDBX_meta *b) { const bool a_older_that_b = meta_ot(prefer_steady, env, a, b); - mdbx_assert(env, !meta_eq(env, a, b)); + eASSERT(env, !meta_eq(env, a, b)); return a_older_that_b ? 
a : b; } @@ -5241,8 +5212,7 @@ MDBX_NOTHROW_PURE_FUNCTION static __inline const MDBX_meta * constmeta_prefer_steady(const MDBX_env *env) { #if MDBX_CACHE_METAPTR if (likely(env->cache_steady_meta)) { - mdbx_assert(env, - env->cache_steady_meta == meta_mostrecent(prefer_steady, env)); + eASSERT(env, env->cache_steady_meta == meta_mostrecent(prefer_steady, env)); return (const MDBX_meta *)env->cache_steady_meta; } #endif /* MDBX_CACHE_METAPTR */ @@ -5262,25 +5232,25 @@ MDBX_NOTHROW_PURE_FUNCTION static __inline const MDBX_meta * constmeta_prefer_last(const MDBX_env *env) { #if MDBX_CACHE_METAPTR if (likely(env->cache_last_meta)) { - mdbx_assert(env, env->cache_last_meta == meta_mostrecent(prefer_last, env)); + eASSERT(env, env->cache_last_meta == meta_mostrecent(prefer_last, env)); return (const MDBX_meta *)env->cache_last_meta; } #endif /* MDBX_CACHE_METAPTR */ return (const MDBX_meta *)meta_prefer_last(env); } -__cold static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { +__cold static txnid_t recent_committed_txnid(const MDBX_env *env) { while (true) { volatile const MDBX_meta *head = meta_prefer_last(env); const txnid_t recent = meta_txnid(env, head); - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); if (likely(head == meta_prefer_last(env) && recent == meta_txnid(env, head))) return recent; } } -static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { +static const char *durable_caption(volatile const MDBX_meta *const meta) { if (META_IS_STEADY(meta)) return (unaligned_peek_u64_volatile(4, meta->mm_datasync_sign) == meta_sign((const MDBX_meta *)meta)) @@ -5295,24 +5265,24 @@ static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { static txnid_t find_oldest_reader(MDBX_env *env) { const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); const txnid_t steady = constmeta_txnid(env, constmeta_prefer_steady(env)); - mdbx_assert(env, steady <= env->me_txn0->mt_txnid); + 
eASSERT(env, steady <= env->me_txn0->mt_txnid); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL /* exclusive without-lck mode */)) { - mdbx_assert(env, env->me_lck == (void *)&env->x_lckless_stub); + eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); return env->me_lck->mti_oldest_reader.weak = steady; } const txnid_t prev_oldest = atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); - mdbx_assert(env, steady >= prev_oldest); + eASSERT(env, steady >= prev_oldest); txnid_t new_oldest = prev_oldest; while (new_oldest != steady && nothing_changed != atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { lck->mti_readers_refresh_flag.weak = nothing_changed; - mdbx_jitter4testing(false); + jitter4testing(false); const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); new_oldest = steady; @@ -5322,7 +5292,7 @@ static txnid_t find_oldest_reader(MDBX_env *env) { atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (!pid) continue; - mdbx_jitter4testing(true); + jitter4testing(true); const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); if (unlikely(rtxn < prev_oldest)) { @@ -5330,9 +5300,9 @@ static txnid_t find_oldest_reader(MDBX_env *env) { atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) && safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) { - mdbx_notice("kick stuck reader[%u of %u].pid_%u %" PRIaTXN - " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, - i, snap_nreaders, pid, rtxn, prev_oldest, steady); + NOTICE("kick stuck reader[%u of %u].pid_%u %" PRIaTXN + " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, + i, snap_nreaders, pid, rtxn, prev_oldest, steady); } continue; } @@ -5346,9 +5316,8 @@ static txnid_t find_oldest_reader(MDBX_env *env) { } if (new_oldest != prev_oldest) { - mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, - new_oldest); - mdbx_assert(env, new_oldest >= lck->mti_oldest_reader.weak); + 
VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest); + eASSERT(env, new_oldest >= lck->mti_oldest_reader.weak); atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); } return new_oldest; @@ -5364,7 +5333,7 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ + /* jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); @@ -5384,13 +5353,13 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, } /* Add a page to the txn's dirty list */ -static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages) { +__hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, + unsigned npages) { #if xMDBX_DEBUG_SPILLING == 2 txn->mt_env->debug_dirtied_act += 1; - mdbx_ensure(txn->mt_env, - txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); - mdbx_ensure(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); + ENSURE(txn->mt_env, + txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); + ENSURE(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); #endif /* xMDBX_DEBUG_SPILLING == 2 */ int rc; @@ -5398,35 +5367,34 @@ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, if (unlikely(txn->tw.dirtyroom == 0)) { if (txn->tw.loose_count) { MDBX_page *loose = txn->tw.loose_pages; - mdbx_debug("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); - rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); + DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); + rc = pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); if (unlikely(rc != 
MDBX_SUCCESS)) goto bailout; unsigned di = dpl_search(txn, loose->mp_pgno); - mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == loose); - mdbx_dpl_remove(txn, di); + tASSERT(txn, txn->tw.dirtylist->items[di].ptr == loose); + dpl_remove(txn, di); txn->tw.loose_pages = loose->mp_next; txn->tw.loose_count--; txn->tw.dirtyroom++; if (!(txn->mt_flags & MDBX_WRITEMAP)) - mdbx_dpage_free(txn->mt_env, loose, 1); + dpage_free(txn->mt_env, loose, 1); } else { - mdbx_error("Dirtyroom is depleted, DPL length %u", - txn->tw.dirtylist->length); + ERROR("Dirtyroom is depleted, DPL length %u", txn->tw.dirtylist->length); if (!(txn->mt_flags & MDBX_WRITEMAP)) - mdbx_dpage_free(txn->mt_env, mp, npages); + dpage_free(txn->mt_env, mp, npages); return MDBX_TXN_FULL; } } - rc = mdbx_dpl_append(txn, mp->mp_pgno, mp, npages); + rc = dpl_append(txn, mp->mp_pgno, mp, npages); if (unlikely(rc != MDBX_SUCCESS)) { bailout: txn->mt_flags |= MDBX_TXN_ERROR; return rc; } txn->tw.dirtyroom--; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); return MDBX_SUCCESS; } @@ -5460,11 +5428,10 @@ MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #if MDBX_ENABLE_MADVISE /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ -__cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, - const bool enable, - const bool force_whole) { - mdbx_assert(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); - mdbx_assert(env, (enable & 1) == (enable != 0)); +__cold static int set_readahead(MDBX_env *env, const pgno_t edge, + const bool enable, const bool force_whole) { + eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); + eASSERT(env, (enable & 1) == (enable != 0)); const bool toggle = force_whole || ((enable ^ env->me_lck->mti_readahead_anchor) & 1) || !env->me_lck->mti_readahead_anchor; @@ -5480,12 +5447,12 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, length = (length < limit) ? 
length : limit; length -= offset; - mdbx_assert(env, 0 <= (intptr_t)length); + eASSERT(env, 0 <= (intptr_t)length); if (length == 0) return MDBX_SUCCESS; - mdbx_notice("readahead %s %u..%u", enable ? "ON" : "OFF", - bytes2pgno(env, offset), bytes2pgno(env, offset + length)); + NOTICE("readahead %s %u..%u", enable ? "ON" : "OFF", bytes2pgno(env, offset), + bytes2pgno(env, offset + length)); #if defined(F_RDAHEAD) if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) @@ -5587,9 +5554,9 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, } #endif /* MDBX_ENABLE_MADVISE */ -__cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, - const pgno_t limit_pgno, const bool implicit) { +__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, + const pgno_t size_pgno, const pgno_t limit_pgno, + const bool implicit) { const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); const size_t prev_size = env->me_dxb_mmap.current; @@ -5598,22 +5565,22 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, const void *const prev_addr = env->me_map; #endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ - mdbx_verbose("resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR, - prev_size, size_bytes, prev_limit, limit_bytes); + VERBOSE("resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR, + prev_size, size_bytes, prev_limit, limit_bytes); - mdbx_assert(env, limit_bytes >= size_bytes); - mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); - mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); + eASSERT(env, limit_bytes >= size_bytes); + eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); + eASSERT(env, bytes2pgno(env, limit_bytes) >= limit_pgno); unsigned mresize_flags = env->me_flags 
& (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); #if defined(_WIN32) || defined(_WIN64) /* Acquire guard in exclusive mode for: * - to avoid collision between read and write txns around env->me_dbgeo; - * - to avoid attachment of new reading threads (see mdbx_rdt_lock); */ - mdbx_srwlock_AcquireExclusive(&env->me_remap_guard); + * - to avoid attachment of new reading threads (see osal_rdt_lock); */ + osal_srwlock_AcquireExclusive(&env->me_remap_guard); mdbx_handle_array_t *suspended = NULL; mdbx_handle_array_t array_onstack; int rc = MDBX_SUCCESS; @@ -5635,9 +5602,9 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); array_onstack.count = 0; suspended = &array_onstack; - rc = mdbx_suspend_threads_before_remap(env, &suspended); + rc = osal_suspend_threads_before_remap(env, &suspended); if (rc != MDBX_SUCCESS) { - mdbx_error("failed suspend-for-remap: errcode %d", rc); + ERROR("failed suspend-for-remap: errcode %d", rc); goto bailout; } mresize_flags |= implicit ? 
MDBX_MRESIZE_MAY_UNMAP @@ -5646,7 +5613,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #else /* Windows */ /* Acquire guard to avoid collision between read and write txns * around env->me_dbgeo */ - int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); + int rc = osal_fastmutex_acquire(&env->me_remap_guard); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (limit_bytes == env->me_dxb_mmap.limit && @@ -5656,7 +5623,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && lck && !implicit) { - int err = mdbx_rdt_lock(env) /* lock readers table until remap done */; + int err = osal_rdt_lock(env) /* lock readers table until remap done */; if (unlikely(MDBX_IS_ERROR(err))) { rc = err; goto bailout; @@ -5665,14 +5632,14 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, /* looking for readers from this process */ const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - mdbx_assert(env, !implicit); + eASSERT(env, !implicit); mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; for (unsigned i = 0; i < snap_nreaders; ++i) { if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != mdbx_thread_self()) { + lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { /* the base address of the mapping can't be changed since * the other reader thread from this process exists. 
*/ - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); break; } @@ -5684,7 +5651,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), MDBX_SYNC_NONE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -5692,9 +5659,9 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #if MDBX_ENABLE_MADVISE if (size_bytes < prev_size) { - mdbx_notice("resize-MADV_%s %u..%u", - (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", - size_pgno, bytes2pgno(env, prev_size)); + NOTICE("resize-MADV_%s %u..%u", + (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, + bytes2pgno(env, prev_size)); rc = MDBX_RESULT_TRUE; #if defined(MADV_REMOVE) if (env->me_flags & MDBX_WRITEMAP) @@ -5728,7 +5695,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #endif /* MDBX_ENABLE_MADVISE */ meta_cache_clear(env); - rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); #if MDBX_ENABLE_MADVISE if (rc == MDBX_SUCCESS) { @@ -5742,15 +5709,15 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, || prev_size > size_bytes #endif /* Windows */ ; - rc = mdbx_set_readahead(env, size_pgno, readahead, force); + rc = set_readahead(env, size_pgno, readahead, force); } #endif /* MDBX_ENABLE_MADVISE */ bailout: if (rc == MDBX_SUCCESS) { - mdbx_assert(env, size_bytes == env->me_dxb_mmap.current); - mdbx_assert(env, size_bytes <= env->me_dxb_mmap.filesize); - mdbx_assert(env, limit_bytes == env->me_dxb_mmap.limit); + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + eASSERT(env, 
size_bytes <= env->me_dxb_mmap.filesize); + eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); #ifdef MDBX_USE_VALGRIND if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); @@ -5762,15 +5729,15 @@ bailout: #endif /* MDBX_USE_VALGRIND */ } else { if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { - mdbx_error("failed resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); + ERROR("failed resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); } else { - mdbx_warning("unable resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); + WARNING("unable resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); } if (!env->me_dxb_mmap.address) { env->me_flags |= MDBX_FATAL_ERROR; @@ -5782,31 +5749,31 @@ bailout: #if defined(_WIN32) || defined(_WIN64) int err = MDBX_SUCCESS; - mdbx_srwlock_ReleaseExclusive(&env->me_remap_guard); + osal_srwlock_ReleaseExclusive(&env->me_remap_guard); if (suspended) { - err = mdbx_resume_threads_after_remap(suspended); + err = osal_resume_threads_after_remap(suspended); if (suspended != &array_onstack) - mdbx_free(suspended); + osal_free(suspended); } #else if (env->me_lck_mmap.lck && (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0) - mdbx_rdt_unlock(env); - int err = mdbx_fastmutex_release(&env->me_remap_guard); + osal_rdt_unlock(env); + int err = osal_fastmutex_release(&env->me_remap_guard); #endif /* Windows */ if (err != MDBX_SUCCESS) { - mdbx_fatal("failed 
resume-after-remap: errcode %d", err); + FATAL("failed resume-after-remap: errcode %d", err); return MDBX_PANIC; } return rc; } -__cold static int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, - const pgno_t limit_pgno) { +__cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, + const pgno_t size_pgno, + const pgno_t limit_pgno) { const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit); - mdbx_assert(env, mapped_pgno >= used_pgno); - return mdbx_mapresize( + eASSERT(env, mapped_pgno >= used_pgno); + return map_resize( env, used_pgno, size_pgno, (size_pgno > mapped_pgno) ? limit_pgno @@ -5816,44 +5783,44 @@ __cold static int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno, true); } -static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, - MDBX_meta *const meta, mdbx_filehandle_t fd) { +static int meta_unsteady(MDBX_env *env, const txnid_t last_steady, + MDBX_meta *const meta, mdbx_filehandle_t fd) { const uint64_t wipe = MDBX_DATASIGN_NONE; if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(env, meta) <= last_steady) { - mdbx_warning("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, - data_page(meta)->mp_pgno); + WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, + data_page(meta)->mp_pgno); if (env->me_flags & MDBX_WRITEMAP) unaligned_poke_u64(4, meta->mm_datasync_sign, wipe); else - return mdbx_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign), + return osal_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign), (uint8_t *)&meta->mm_datasync_sign - env->me_map); if (constmeta_txnid(env, meta) == last_steady) - mdbx_assert(env, meta_checktxnid(env, meta, true)); + eASSERT(env, meta_checktxnid(env, meta, true)); } return MDBX_SUCCESS; } -__cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { +__cold static int wipe_steady(MDBX_env *env, const txnid_t last_steady) { #if MDBX_ENABLE_PGOP_STAT 
env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) ? env->me_dsync_fd : env->me_lazy_fd; - int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); + int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); + err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); + err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); if (unlikely(err != MDBX_SUCCESS)) return err; if (env->me_flags & MDBX_WRITEMAP) { - mdbx_flush_incoherent_cpu_writeback(); - err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + osal_flush_incoherent_cpu_writeback(); + err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -5870,11 +5837,11 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { } if (syncfilerange_unavailable) #endif /* MDBX_USE_SYNCFILERANGE */ - err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; } - mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); } @@ -6291,8 +6258,8 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; - mdbx_assert(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); - mdbx_assert(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); + eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); + eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); const unsigned 
coalesce_threshold = env->me_maxgc_ov1page >> 2; if (likely(flags & MDBX_ALLOC_GC)) { @@ -6313,8 +6280,8 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); } - mdbx_assert(env, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; unsigned re_len = MDBX_PNL_SIZE(re_list); pgno_t *range = nullptr; @@ -6331,17 +6298,17 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Seek a big enough contiguous page range. * Prefer pages with lower pgno. */ - mdbx_assert(env, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + eASSERT(env, + pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { - mdbx_assert(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && - MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); + eASSERT(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && + MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); range = re_list + (MDBX_PNL_ASCENDING ? 
1 : re_len); pgno = *range; if (num == 1) goto done; range = scan4seq(range, re_len, num - 1); - mdbx_tassert(txn, range == scan4range_checker(re_list, num - 1)); + tASSERT(txn, range == scan4range_checker(re_list, num - 1)); if (likely(range)) { pgno = *range; goto done; @@ -6355,11 +6322,11 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Prepare to fetch and coalesce */ #if MDBX_ENABLE_PGOP_STAT if (likely(timestamp == 0)) - timestamp = mdbx_osal_monotime(); + timestamp = osal_monotime(); #endif /* MDBX_ENABLE_PGOP_STAT */ detent = find_oldest_reader(env) + 1; - ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI); + ret.err = cursor_init(&recur.outer, txn, FREE_DBI); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; if (flags & MDBX_LIFORECLAIM) { @@ -6435,14 +6402,14 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Reading next GC record */ MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top]; - if (unlikely((ret.err = mdbx_node_read( + if (unlikely((ret.err = node_read( &recur.outer, page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), &data, mp)) != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { - txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + txn->tw.lifo_reclaimed = txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { ret.err = MDBX_ENOMEM; goto fail; @@ -6450,9 +6417,9 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { } /* Append PNL from GC record to tw.reclaimed_pglist */ - mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); + cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); pgno_t *gc_pnl = (pgno_t *)data.iov_base; - mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); + tASSERT(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || !pnl_check(gc_pnl, txn->mt_next_pgno))) { @@ -6473,61 +6440,58 @@ static pgr_t 
page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { * This is a rare case while search for a continuously multi-page region * in a large database. * todo4recovery://erased_by_github/libmdbx/issues/123 */ - mdbx_notice("stop reclaiming to avoid PNL overflow: %u (current) + %u " - "(chunk) -> %u", - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, - gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + NOTICE("stop reclaiming to avoid PNL overflow: %u (current) + %u " + "(chunk) -> %u", + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, + gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); break; } - ret.err = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); + ret.err = pnl_need(&txn->tw.reclaimed_pglist, gc_len); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; re_list = txn->tw.reclaimed_pglist; /* Remember ID of GC record */ if (flags & MDBX_LIFORECLAIM) { - ret.err = mdbx_txl_append(&txn->tw.lifo_reclaimed, last); + ret.err = txl_append(&txn->tw.lifo_reclaimed, last); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; } txn->tw.last_reclaimed = last; - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO - " num %u, PNL", - last, txn->mt_dbs[FREE_DBI].md_root, gc_len); + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", + last, txn->mt_dbs[FREE_DBI].md_root, gc_len); for (unsigned i = gc_len; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]); - mdbx_debug_extra_print("%s\n", "."); + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); + DEBUG_EXTRA_PRINT("%s\n", "."); } /* Merge in descending sorted order */ pnl_merge(re_list, gc_pnl); - if (mdbx_audit_enabled() && - unlikely(!pnl_check(re_list, txn->mt_next_pgno))) { + if (AUDIT_ENABLED() && unlikely(!pnl_check(re_list, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; } - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + 
tASSERT(txn, dirtylist_check(txn)); re_len = MDBX_PNL_SIZE(re_list); - mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); + tASSERT(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); if (MDBX_ENABLE_REFUND && re_len && unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { /* Refund suitable pages into "unallocated" space */ - mdbx_refund(txn); + txn_refund(txn); re_list = txn->tw.reclaimed_pglist; re_len = MDBX_PNL_SIZE(re_list); } /* Done for a kick-reclaim mode, actually no page needed */ if (unlikely(flags & MDBX_ALLOC_SLOT)) { - mdbx_debug("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); + DEBUG("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); #if MDBX_ENABLE_PGOP_STAT - mdbx_assert(env, timestamp != 0); - env->me_lck->mti_pgop_stat.gcrtime.weak += - mdbx_osal_monotime() - timestamp; + eASSERT(env, timestamp != 0); + env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; #endif /* MDBX_ENABLE_PGOP_STAT */ ret.err = MDBX_SUCCESS; ret.page = NULL; @@ -6537,14 +6501,13 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* Don't try to coalesce too much. */ if (re_len /* current size */ > coalesce_threshold) { if (flags & MDBX_ALLOC_COALESCE) - mdbx_trace("clear %s %s", "MDBX_ALLOC_COALESCE", - "since got threshold"); + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); flags &= ~MDBX_ALLOC_COALESCE; } } if (F_ISSET(flags, MDBX_ALLOC_COALESCE | MDBX_ALLOC_GC)) { - mdbx_debug_extra("clear %s and continue", "MDBX_ALLOC_COALESCE"); + DEBUG_EXTRA("clear %s and continue", "MDBX_ALLOC_COALESCE"); flags &= ~MDBX_ALLOC_COALESCE; continue; } @@ -6567,11 +6530,10 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* does reclaiming stopped at the last steady point? 
*/ if (head != steady && META_IS_STEADY(steady) && detent == constmeta_txnid(env, steady) + 1) { - mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN - "-%s, detent %" PRIaTXN, - constmeta_txnid(env, head), mdbx_durable_str(head), - constmeta_txnid(env, steady), mdbx_durable_str(steady), - detent); + DEBUG("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN + "-%s, detent %" PRIaTXN, + constmeta_txnid(env, head), durable_caption(head), + constmeta_txnid(env, steady), durable_caption(steady), detent); ret.err = MDBX_RESULT_TRUE; const pgno_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); @@ -6590,15 +6552,15 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { next >= steady->mm_geo.now)) { /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode * without any auto-sync threshold(s). */ - ret.err = mdbx_wipe_steady(env, detent); - mdbx_debug("gc-wipe-steady, rc %d", ret.err); - mdbx_assert(env, steady != meta_prefer_steady(env)); + ret.err = wipe_steady(env, detent); + DEBUG("gc-wipe-steady, rc %d", ret.err); + eASSERT(env, steady != meta_prefer_steady(env)); } else if ((flags & MDBX_ALLOC_NEW) == 0 || (autosync_threshold && atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - + osal_monotime() - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= autosync_period) || @@ -6607,9 +6569,9 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { (autosync_threshold | autosync_period) == 0)) { /* make steady checkpoint. 
*/ MDBX_meta meta = *head; - ret.err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); - mdbx_debug("gc-make-steady, rc %d", ret.err); - mdbx_assert(env, steady != meta_prefer_steady(env)); + ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); + DEBUG("gc-make-steady, rc %d", ret.err); + eASSERT(env, steady != meta_prefer_steady(env)); } if (likely(ret.err != MDBX_RESULT_TRUE)) { if (unlikely(ret.err != MDBX_SUCCESS)) @@ -6635,41 +6597,39 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (flags & MDBX_ALLOC_NEW) { ret.err = MDBX_MAP_FULL; if (next < txn->mt_geo.upper && txn->mt_geo.grow_pv) { - mdbx_assert(env, next > txn->mt_end_pgno); + eASSERT(env, next > txn->mt_end_pgno); const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv); size_t aligned = pgno_align2os_pgno( env, (pgno_t)(next + grow_step - next % grow_step)); if (aligned > txn->mt_geo.upper) aligned = txn->mt_geo.upper; - mdbx_assert(env, aligned > txn->mt_end_pgno); + eASSERT(env, aligned > txn->mt_end_pgno); - mdbx_verbose("try growth datafile to %zu pages (+%zu)", aligned, - aligned - txn->mt_end_pgno); - ret.err = mdbx_mapresize_implicit(env, txn->mt_next_pgno, - (pgno_t)aligned, txn->mt_geo.upper); + VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, + aligned - txn->mt_end_pgno); + ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, + txn->mt_geo.upper); if (ret.err == MDBX_SUCCESS) { env->me_txn->mt_end_pgno = (pgno_t)aligned; goto done; } - mdbx_error("unable growth datafile to %zu pages (+%zu), errcode %d", - aligned, aligned - txn->mt_end_pgno, ret.err); + ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, + aligned - txn->mt_end_pgno, ret.err); } else { - mdbx_notice("gc-alloc: next %zu > upper %" PRIaPGNO, next, - txn->mt_geo.upper); + NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, next, + txn->mt_geo.upper); } } fail: #if MDBX_ENABLE_PGOP_STAT if (timestamp) - 
env->me_lck->mti_pgop_stat.gcrtime.weak += - mdbx_osal_monotime() - timestamp; + env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; #endif /* MDBX_ENABLE_PGOP_STAT */ - mdbx_assert(env, - pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); int level; const char *what; if (likely(!(flags & MDBX_ALLOC_FAKE))) { @@ -6680,26 +6640,26 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { level = (flags & MDBX_ALLOC_NOLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; } - if (mdbx_log_enabled(level)) - mdbx_debug_log(level, __func__, __LINE__, - "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); + if (LOG_ENABLED(level)) + debug_log(level, __func__, __LINE__, + "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, + flags, ret.err); - mdbx_assert(env, ret.err != MDBX_SUCCESS); + eASSERT(env, ret.err != MDBX_SUCCESS); ret.page = NULL; return ret; } done: - mdbx_assert(env, !(flags & MDBX_ALLOC_SLOT)); - mdbx_ensure(env, pgno >= NUM_METAS); + eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); + ENSURE(env, pgno >= NUM_METAS); #if MDBX_ENABLE_PGOP_STAT if (likely(timestamp)) - env->me_lck->mti_pgop_stat.gcrtime.weak += mdbx_osal_monotime() - timestamp; + env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; #endif /* MDBX_ENABLE_PGOP_STAT */ if (unlikely(flags & MDBX_ALLOC_FAKE)) { - mdbx_debug("return NULL-page for %u pages %s allocation", num, - "gc-slot/backlog"); + DEBUG("return NULL-page for %u pages %s allocation", num, + "gc-slot/backlog"); ret.page = NULL; ret.err = MDBX_SUCCESS; return ret; @@ -6710,7 +6670,7 @@ done: VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); } else { - ret.page = 
mdbx_page_malloc(txn, num); + ret.page = page_malloc(txn, num); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; goto fail; @@ -6718,9 +6678,9 @@ done: } if (range) { - mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); - mdbx_tassert(txn, pgno < txn->mt_next_pgno); - mdbx_tassert(txn, pgno == *range); + cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); + tASSERT(txn, pgno < txn->mt_next_pgno); + tASSERT(txn, pgno == *range); /* Cutoff allocated pages from tw.reclaimed_pglist */ #if MDBX_PNL_ASCENDING for (const pgno_t *const end = re_list + re_len - num; range <= end; @@ -6731,12 +6691,11 @@ done: range[-(ptrdiff_t)num] = *range; #endif MDBX_PNL_SIZE(re_list) = re_len -= num; - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { txn->mt_next_pgno = pgno + num; - mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); + eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); } if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) @@ -6746,16 +6705,15 @@ done: ret.page->mp_pgno = pgno; ret.page->mp_leaf2_ksize = 0; ret.page->mp_flags = 0; - if ((mdbx_assert_enabled() || mdbx_audit_enabled()) && num > 1) { + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { ret.page->mp_pages = num; ret.page->mp_flags = P_OVERFLOW; } - ret.err = mdbx_page_dirty(txn, ret.page, num); + ret.err = page_dirty(txn, ret.page, num); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); return ret; } @@ -6767,7 +6725,7 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { while (likely(txn->tw.loose_pages)) { #if MDBX_ENABLE_REFUND if (unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) { - mdbx_refund(txn); + txn_refund(txn); if (!txn->tw.loose_pages) break; } @@ -6776,10 +6734,9 @@ __hot static pgr_t 
page_alloc(MDBX_cursor *mc) { MDBX_page *page = txn->tw.loose_pages; txn->tw.loose_pages = page->mp_next; txn->tw.loose_count--; - mdbx_debug_extra("db %d use loose page %" PRIaPGNO, DDBI(mc), - page->mp_pgno); - mdbx_tassert(txn, page->mp_pgno < txn->mt_next_pgno); - mdbx_tassert(txn, page->mp_pgno >= NUM_METAS); + DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), page->mp_pgno); + tASSERT(txn, page->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, page->mp_pgno >= NUM_METAS); VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env)); MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env)); page->mp_txnid = txn->mt_front; @@ -6806,7 +6763,7 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { ret.page = pgno2page(env, pgno); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); } else { - ret.page = mdbx_page_malloc(txn, 1); + ret.page = page_malloc(txn, 1); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; return ret; @@ -6817,11 +6774,10 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { ret.page->mp_pgno = pgno; ret.page->mp_leaf2_ksize = 0; ret.page->mp_flags = 0; - mdbx_tassert(txn, ret.page->mp_pgno >= NUM_METAS); + tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); - ret.err = mdbx_page_dirty(txn, ret.page, 1); - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + ret.err = page_dirty(txn, ret.page, 1); + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); return ret; } @@ -6831,8 +6787,8 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { } /* Copy the used portions of a non-large/overflow page. 
*/ -__hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, - size_t psize) { +__hot static void page_copy(MDBX_page *dst, const MDBX_page *src, + size_t psize) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { @@ -6856,34 +6812,34 @@ __hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, * * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. */ -static pgr_t __must_check_result mdbx_page_unspill(MDBX_txn *const txn, - const MDBX_page *const mp) { - mdbx_verbose("unspill page %" PRIaPGNO, mp->mp_pgno); - mdbx_tassert(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - mdbx_tassert(txn, IS_SPILLED(txn, mp)); +static pgr_t __must_check_result page_unspill(MDBX_txn *const txn, + const MDBX_page *const mp) { + VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + tASSERT(txn, IS_SPILLED(txn, mp)); const MDBX_txn *scan = txn; pgr_t ret; do { - mdbx_tassert(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); - const unsigned si = mdbx_search_spilled(scan, mp->mp_pgno); + tASSERT(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); + const unsigned si = search_spilled(scan, mp->mp_pgno); if (!si) continue; const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; - ret.page = mdbx_page_malloc(txn, npages); + ret.page = page_malloc(txn, npages); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; return ret; } - mdbx_page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); + page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); if (scan == txn) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. * Otherwise mark it as deleted by setting the LSB. 
*/ - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ - ret.err = mdbx_page_dirty(txn, ret.page, npages); + ret.err = page_dirty(txn, ret.page, npages); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; #if MDBX_ENABLE_PGOP_STAT @@ -6894,11 +6850,11 @@ static pgr_t __must_check_result mdbx_page_unspill(MDBX_txn *const txn, return ret; } while (likely((scan = scan->mt_parent) != nullptr && (scan->mt_flags & MDBX_TXN_SPILLS) != 0)); - mdbx_error("Page %" PRIaPGNO " mod-txnid %" PRIaTXN - " not found in the spill-list(s), current txn %" PRIaTXN - " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, - mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, - txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); + ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN + " not found in the spill-list(s), current txn %" PRIaTXN + " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, + mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, + txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); ret.err = MDBX_PROBLEM; ret.page = NULL; return ret; @@ -6910,25 +6866,25 @@ static pgr_t __must_check_result mdbx_page_unspill(MDBX_txn *const txn, * [in] mc cursor pointing to the page to be touched * * Returns 0 on success, non-zero on failure. 
*/ -__hot static int mdbx_page_touch(MDBX_cursor *mc) { +__hot static int page_touch(MDBX_cursor *mc) { const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; MDBX_page *np; MDBX_txn *txn = mc->mc_txn; int rc; - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { if (mc->mc_flags & C_SUB) { MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - mdbx_tassert(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - mdbx_tassert(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); - mdbx_tassert(txn, *couple->outer.mc_dbistate & DBI_DIRTY); + tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); } else { - mdbx_tassert(txn, *mc->mc_dbistate & DBI_DIRTY); + tASSERT(txn, *mc->mc_dbistate & DBI_DIRTY); } - mdbx_tassert(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - mdbx_tassert(txn, !IS_OVERFLOW(mp)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, !IS_OVERFLOW(mp)); + tASSERT(txn, dirtylist_check(txn)); } if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp)) @@ -6936,7 +6892,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { if (IS_FROZEN(txn, mp)) { /* CoW the page */ - rc = mdbx_pnl_need(&txn->tw.retired_pages, 1); + rc = pnl_need(&txn->tw.retired_pages, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; const pgr_t par = page_alloc(mc); @@ -6946,10 +6902,10 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { goto fail; const pgno_t pgno = np->mp_pgno; - mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), - mp->mp_pgno, pgno); - mdbx_tassert(txn, mp->mp_pgno != pgno); - mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), + mp->mp_pgno, pgno); + tASSERT(txn, mp->mp_pgno != pgno); + 
pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; @@ -6962,43 +6918,43 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { #if MDBX_ENABLE_PGOP_STAT txn->mt_env->me_lck->mti_pgop_stat.cow.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - mdbx_page_copy(np, mp, txn->mt_env->me_psize); + page_copy(np, mp, txn->mt_env->me_psize); np->mp_pgno = pgno; np->mp_txnid = txn->mt_front; } else if (IS_SPILLED(txn, mp)) { - pgr_t pur = mdbx_page_unspill(txn, mp); + pgr_t pur = page_unspill(txn, mp); np = pur.page; rc = pur.err; if (likely(rc == MDBX_SUCCESS)) { - mdbx_tassert(txn, np != nullptr); + tASSERT(txn, np != nullptr); goto done; } goto fail; } else { if (unlikely(!txn->mt_parent)) { - mdbx_error("Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + IS_BRANCH(mp) ? 
"branch" : "leaf", mp->mp_pgno, mp->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); rc = MDBX_PROBLEM; goto fail; } - mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - mdbx_tassert(txn, txn->tw.dirtylist->length <= - MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + DEBUG("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); + tASSERT(txn, + txn->tw.dirtylist->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); /* No - copy it */ - np = mdbx_page_malloc(txn, 1); + np = page_malloc(txn, 1); if (unlikely(!np)) { rc = MDBX_ENOMEM; goto fail; } - mdbx_page_copy(np, mp, txn->mt_env->me_psize); + page_copy(np, mp, txn->mt_env->me_psize); /* insert a clone of parent's dirty page, so don't touch dirtyroom */ - rc = mdbx_page_dirty(txn, np, 1); + rc = page_dirty(txn, np, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -7039,8 +6995,7 @@ fail: return rc; } -__cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, - bool nonblock) { +__cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { bool locked = false; int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; @@ -7072,12 +7027,12 @@ retry:; atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - + osal_monotime() - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - const bool inside_txn = (env->me_txn0->mt_owner == mdbx_thread_self()); + const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); if (!inside_txn) { if (!locked) { int err; @@ -7087,27 +7042,27 @@ retry:; /* pre-sync to avoid latency for writer */ if (unsynced_pages > /* FIXME: define threshold */ 16 && (flags & MDBX_SAFE_NOSYNC) == 0) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & 
MDBX_WRITEMAP) { /* Acquire guard to avoid collision with remap */ #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_AcquireShared(&env->me_remap_guard); + osal_srwlock_AcquireShared(&env->me_remap_guard); #else - err = mdbx_fastmutex_acquire(&env->me_remap_guard); + err = osal_fastmutex_acquire(&env->me_remap_guard); if (unlikely(err != MDBX_SUCCESS)) return err; #endif const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); - err = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); + err = osal_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); #else - int unlock_err = mdbx_fastmutex_release(&env->me_remap_guard); + int unlock_err = osal_fastmutex_release(&env->me_remap_guard); if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS) err = unlock_err; #endif } else - err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -7131,22 +7086,22 @@ retry:; goto retry; } env->me_txn0->mt_txnid = head_txnid; - mdbx_assert(env, head_txnid == meta_txnid(env, head)); - mdbx_assert(env, head_txnid == mdbx_recent_committed_txnid(env)); + eASSERT(env, head_txnid == meta_txnid(env, head)); + eASSERT(env, head_txnid == recent_committed_txnid(env)); find_oldest_reader(env); flags |= MDBX_SHRINK_ALLOWED; } - mdbx_assert(env, inside_txn || locked); - mdbx_assert(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); + eASSERT(env, inside_txn || locked); + eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); if (!META_IS_STEADY(head) || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { - mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, - data_page((const void *)head)->mp_pgno, mdbx_durable_str(head), - unsynced_pages); + DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending 
%" PRIaPGNO, + data_page((const void *)head)->mp_pgno, durable_caption(head), + unsynced_pages); MDBX_meta meta = *head; - rc = mdbx_sync_locked(env, flags, &meta); + rc = sync_locked(env, flags, &meta); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -7159,10 +7114,10 @@ retry:; env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, + ? osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), MDBX_SYNC_DATA | MDBX_SYNC_IODQ) - : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (likely(rc == MDBX_SUCCESS)) atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head_txnid, mo_Relaxed); @@ -7182,7 +7137,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { return MDBX_EBADSIGN; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -7194,7 +7149,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { if (wanna_active) { if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) return MDBX_EPERM; - mdbx_assert(env, env->me_map != nullptr); + eASSERT(env, env->me_map != nullptr); } return MDBX_SUCCESS; @@ -7205,7 +7160,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_env_sync_internal(env, force, nonblock); + return env_sync(env, force, nonblock); } #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API @@ -7217,7 +7172,7 @@ __cold int mdbx_env_sync_poll(MDBX_env *env) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Back up parent txn's cursors, then grab the originals for tracking */ -static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { +static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { for (int i 
= parent->mt_numdbs; --i >= 0;) { nested->mt_cursors[i] = NULL; MDBX_cursor *mc = parent->mt_cursors[i]; @@ -7228,7 +7183,7 @@ static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { bk = mc; if (mc->mc_signature != MDBX_MC_LIVE) continue; - bk = mdbx_malloc(size); + bk = osal_malloc(size); if (unlikely(!bk)) return MDBX_ENOMEM; #if MDBX_DEBUG @@ -7262,7 +7217,7 @@ static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { * [in] merge true to keep changes to parent cursors, false to revert. * * Returns 0 on success, non-zero on failure. */ -static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { +static void cursors_eot(MDBX_txn *txn, const bool merge) { for (int i = txn->mt_numdbs; --i >= 0;) { MDBX_cursor *next, *mc = txn->mt_cursors[i]; if (!mc) @@ -7272,14 +7227,14 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { const unsigned stage = mc->mc_signature; MDBX_cursor *bk = mc->mc_backup; next = mc->mc_next; - mdbx_ensure(txn->mt_env, - stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); - mdbx_cassert(mc, mc->mc_dbi == (unsigned)i); + ENSURE(txn->mt_env, + stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); + cASSERT(mc, mc->mc_dbi == (unsigned)i); if (bk) { MDBX_xcursor *mx = mc->mc_xcursor; - mdbx_cassert(mc, mx == bk->mc_xcursor); - mdbx_tassert(txn, txn->mt_parent != NULL); - mdbx_ensure(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + cASSERT(mc, mx == bk->mc_xcursor); + tASSERT(txn, txn->mt_parent != NULL); + ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) mc->mc_signature = stage /* Promote closed state to parent txn */; else if (merge) { @@ -7303,9 +7258,9 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { *mx = *(MDBX_xcursor *)(bk + 1); } bk->mc_signature = 0; - mdbx_free(bk); + osal_free(bk); } else { - mdbx_ensure(txn->mt_env, stage == MDBX_MC_LIVE); + ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); 
mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; mc->mc_flags = 0 /* reset C_UNTRACK */; } @@ -7315,7 +7270,7 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) /* Find largest mvcc-snapshot still referenced by this process. */ -static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { +static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck != NULL /* exclusive mode */)) { const unsigned snap_nreaders = @@ -7324,7 +7279,7 @@ static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) == env->me_pid) { - /* mdbx_jitter4testing(true); */ + /* jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); @@ -7345,7 +7300,7 @@ static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { return largest; } -static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { +static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { #if !defined(__SANITIZE_ADDRESS__) if (!RUNNING_ON_VALGRIND) return; @@ -7361,7 +7316,7 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { } else { /* transaction end */ bool should_unlock = false; pgno_t last = MAX_PAGENO + 1; - if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) { + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { /* inside write-txn */ const MDBX_meta *head = constmeta_prefer_last(env); last = head->mm_geo.next; @@ -7378,10 +7333,10 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { return; } - last = mdbx_find_largest_this(env, last); + last = find_largest_this(env, last); const pgno_t edge = env->me_poison_edge; if (edge > last) { - mdbx_assert(env, last >= 
NUM_METAS); + eASSERT(env, last >= NUM_METAS); env->me_poison_edge = last; VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last), pgno2bytes(env, edge - last)); @@ -7400,28 +7355,28 @@ typedef struct { } bind_rslot_result; static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { - mdbx_assert(env, env->me_lck_mmap.lck); - mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); - mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); + eASSERT(env, env->me_lck_mmap.lck); + eASSERT(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); + eASSERT(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); - bind_rslot_result result = {mdbx_rdt_lock(env), nullptr}; + bind_rslot_result result = {osal_rdt_lock(env), nullptr}; if (unlikely(MDBX_IS_ERROR(result.err))) return result; if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = MDBX_PANIC; return result; } if (unlikely(!env->me_map)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = MDBX_EPERM; return result; } if (unlikely(env->me_live_reader != env->me_pid)) { - result.err = mdbx_rpid_set(env); + result.err = osal_rpid_set(env); if (unlikely(result.err != MDBX_SUCCESS)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); return result; } env->me_live_reader = env->me_pid; @@ -7439,9 +7394,9 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { if (likely(slot < env->me_maxreaders)) break; - result.err = mdbx_cleanup_dead_readers(env, true, NULL); + result.err = cleanup_dead_readers(env, true, NULL); if (result.err != MDBX_RESULT_TRUE) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = (result.err == MDBX_SUCCESS) ? 
MDBX_READERS_FULL : result.err; return result; @@ -7460,10 +7415,10 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { env->me_lck->mti_numreaders.weak = ++nreaders; result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 0 : tid; atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - mdbx_assert(env, env->me_live_reader == env->me_pid); + eASSERT(env, env->me_live_reader == env->me_pid); thread_rthc_set(env->me_txkey, result.rslot); } return result; @@ -7478,22 +7433,22 @@ __cold int mdbx_thread_register(const MDBX_env *env) { return (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); return MDBX_EINVAL /* MDBX_NOTLS mode */; } - mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r != NULL)) { - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); if (unlikely(r->mr_pid.weak != env->me_pid)) return MDBX_BAD_RSLOT; return MDBX_RESULT_TRUE /* already registered */; } - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid)) return MDBX_TXN_OVERLAPPING; return bind_rslot((MDBX_env *)env, tid).err; @@ -7508,23 +7463,23 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_RESULT_TRUE; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - 
mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */; } - mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r == NULL)) return MDBX_RESULT_TRUE /* not registered */; - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); if (unlikely(r->mr_pid.weak != env->me_pid || - r->mr_tid.weak != mdbx_thread_self())) + r->mr_tid.weak != osal_thread_self())) return MDBX_BAD_RSLOT; - mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) return MDBX_BUSY /* transaction is still active */; @@ -7559,22 +7514,20 @@ static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, (!freedb_mod_txnid && freedb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - mdbx_warning( - "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "free", freedb_mod_txnid, head_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN + " for meta_txnid %" PRIaTXN " %s", + "free", freedb_mod_txnid, head_txnid, + "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } if (unlikely(head_txnid < maindb_mod_txnid || (!maindb_mod_txnid && maindb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - mdbx_warning( - "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "main", 
maindb_mod_txnid, head_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN + " for meta_txnid %" PRIaTXN " %s", + "main", maindb_mod_txnid, head_txnid, + "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } if (likely(freedb_root && freedb_mod_txnid)) { @@ -7584,7 +7537,7 @@ static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, const txnid_t root_txnid = freedb_root->mp_txnid; if (unlikely(root_txnid != freedb_mod_txnid)) { if (report) - mdbx_warning( + WARNING( "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN " for %sdb.mod_txnid %" PRIaTXN " %s", freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, @@ -7599,7 +7552,7 @@ static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, const txnid_t root_txnid = maindb_root->mp_txnid; if (unlikely(root_txnid != maindb_mod_txnid)) { if (report) - mdbx_warning( + WARNING( "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN " for %sdb.mod_txnid %" PRIaTXN " %s", maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, @@ -7612,10 +7565,10 @@ static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, __cold static bool is_timeout(uint64_t *timestamp) { if (likely(!*timestamp)) { - *timestamp = mdbx_osal_monotime(); + *timestamp = osal_monotime(); return false; } - return mdbx_osal_monotime() - *timestamp > 65536 / 10; + return osal_monotime() - *timestamp > 65536 / 10; } /* check with timeout as the workaround @@ -7626,7 +7579,7 @@ static int meta_waittxnid(const MDBX_env *env, const volatile MDBX_meta *meta, return MDBX_SUCCESS; if (likely(!is_timeout(timestamp))) { - mdbx_memory_fence(mo_AcquireRelease, true); + osal_memory_fence(mo_AcquireRelease, true); #if defined(_WIN32) || defined(_WIN64) SwitchToThread(); #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) @@ -7639,18 +7592,18 @@ static int 
meta_waittxnid(const MDBX_env *env, const volatile MDBX_meta *meta, return MDBX_RESULT_TRUE; } - mdbx_error("bailout waiting for valid snapshot (%s)", - "workaround for incoherent flaw of unified page/buffer cache"); + ERROR("bailout waiting for valid snapshot (%s)", + "workaround for incoherent flaw of unified page/buffer cache"); return MDBX_CORRUPTED; } /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ -static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { +static int txn_renew(MDBX_txn *txn, const unsigned flags) { MDBX_env *env = txn->mt_env; int rc; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -7669,28 +7622,28 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == 0); - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); if (flags & MDBX_TXN_RDONLY) { - mdbx_assert(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); + eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); txn->mt_flags = MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); MDBX_reader *r = txn->to.reader; STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - mdbx_assert(env, !(env->me_flags & MDBX_NOTLS)); + eASSERT(env, !(env->me_flags & MDBX_NOTLS)); r = thread_rthc_get(env->me_txkey); if (likely(r)) { if (unlikely(!r->mr_pid.weak) && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { + (runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { thread_rthc_set(env->me_txkey, nullptr); r = nullptr; } else { - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); } } } else { - 
mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); } if (likely(r)) { @@ -7705,12 +7658,12 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } txn->to.reader = r; if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { - mdbx_assert(env, txn->mt_txnid == 0); - mdbx_assert(env, txn->mt_owner == 0); - mdbx_assert(env, txn->mt_numdbs == 0); + eASSERT(env, txn->mt_txnid == 0); + eASSERT(env, txn->mt_owner == 0); + eASSERT(env, txn->mt_numdbs == 0); if (likely(r)) { - mdbx_assert(env, r->mr_snapshot_pages_used.weak == 0); - mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + eASSERT(env, r->mr_snapshot_pages_used.weak == 0); + eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed); } txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; @@ -7726,9 +7679,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { likely(env->me_stuck_meta < 0) ? /* regular */ meta_prefer_last(env) : /* recovery mode */ METAPAGE(env, env->me_stuck_meta); - mdbx_jitter4testing(false); + jitter4testing(false); const txnid_t target_txnid = meta_txnid(env, meta); - mdbx_jitter4testing(false); + jitter4testing(false); if (likely(r)) { safe64_reset(&r->mr_txnid, false); atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, @@ -7737,23 +7690,22 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { unaligned_peek_u64_volatile(4, meta->mm_pages_retired), mo_Relaxed); safe64_write(&r->mr_txnid, target_txnid); - mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); - mdbx_assert( - env, r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 
0 : mdbx_thread_self())); - mdbx_assert(env, - r->mr_txnid.weak == target_txnid || - (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && - target_txnid < env->me_lck->mti_oldest_reader.weak)); + jitter4testing(false); + eASSERT(env, r->mr_pid.weak == osal_getpid()); + eASSERT(env, + r->mr_tid.weak == + ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self())); + eASSERT(env, r->mr_txnid.weak == target_txnid || + (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && + target_txnid < env->me_lck->mti_oldest_reader.weak)); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_AcquireRelease); } else { /* exclusive mode without lck */ - mdbx_assert(env, !env->me_lck_mmap.lck && - env->me_lck == (void *)&env->x_lckless_stub); + eASSERT(env, !env->me_lck_mmap.lck && + env->me_lck == (void *)&env->x_lckless_stub); } - mdbx_jitter4testing(true); + jitter4testing(true); /* Snap the state from current meta-head */ txn->mt_txnid = target_txnid; @@ -7764,15 +7716,15 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. 
*/ - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); const txnid_t oldest = atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease); if (unlikely(target_txnid < oldest || (meta != meta_prefer_last(env) && env->me_stuck_meta < 0) || target_txnid != meta_txnid(env, meta))) { if (unlikely(++loop > 42)) { - mdbx_error("bailout waiting for valid snapshot (%s)", - "metapages are too volatile"); + ERROR("bailout waiting for valid snapshot (%s)", + "metapages are too volatile"); rc = MDBX_PROBLEM; goto bailout; } @@ -7781,7 +7733,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } rc = meta_waittxnid(env, meta, &timestamp); - mdbx_jitter4testing(false); + jitter4testing(false); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) @@ -7789,25 +7741,25 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { - mdbx_error("%s", "environment corrupted by died writer, must shutdown!"); + ERROR("%s", "environment corrupted by died writer, must shutdown!"); rc = MDBX_CORRUPTED; goto bailout; } - mdbx_assert(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); + eASSERT(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); txn->mt_numdbs = env->me_numdbs; } else { - mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | - MDBX_WRITEMAP)) == 0); + eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | + MDBX_WRITEMAP)) == 0); if (unlikely(txn->mt_owner == tid || /* not recovery mode */ env->me_stuck_meta >= 0)) return MDBX_BUSY; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && 
(env->me_flags & MDBX_NOTLS) == 0 && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { + (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { @@ -7823,7 +7775,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Not yet touching txn == env->me_txn0, it may be active */ - mdbx_jitter4testing(false); + jitter4testing(false); rc = mdbx_txn_lock(env, F_ISSET(flags, MDBX_TXN_TRY)); if (unlikely(rc)) return rc; @@ -7839,7 +7791,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { #endif /* Windows */ meta_cache_clear(env); - mdbx_jitter4testing(false); + jitter4testing(false); const MDBX_meta *meta = constmeta_prefer_last(env); uint64_t timestamp = 0; while ( @@ -7850,13 +7802,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - mdbx_jitter4testing(false); + jitter4testing(false); txn->mt_canary = meta->mm_canary; const txnid_t snap = constmeta_txnid(env, meta); txn->mt_txnid = safe64_txnid_next(snap); if (unlikely(txn->mt_txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto bailout; } @@ -7881,7 +7833,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { /* Moved to here to avoid a data race in read TXNs */ txn->mt_geo = meta->mm_geo; - rc = mdbx_dpl_alloc(txn); + rc = dpl_alloc(txn); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; @@ -7889,7 +7841,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Setup db info */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs); for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { const unsigned db_flags = env->me_dbflags[i]; @@ -7903,7 
+7855,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_warning("%s", "environment had fatal error, must shutdown!"); + WARNING("%s", "environment had fatal error, must shutdown!"); rc = MDBX_PANIC; } else { const size_t size = @@ -7916,9 +7868,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { rc = MDBX_UNABLE_EXTEND_MAPSIZE; goto bailout; } - rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno, - txn->mt_geo.upper, - (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); + rc = map_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper, + (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); if (rc != MDBX_SUCCESS) goto bailout; } else { @@ -7936,19 +7888,19 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; - mdbx_srwlock_AcquireShared(&env->me_remap_guard); + osal_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, txn); + txn_valgrind(env, txn); #endif txn->mt_owner = tid; return MDBX_SUCCESS; } bailout: - mdbx_tassert(txn, rc != MDBX_SUCCESS); - mdbx_txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); + tASSERT(txn, rc != MDBX_SUCCESS); + txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); return rc; } @@ -7962,12 +7914,12 @@ static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { if (unlikely(txn->mt_flags & bad_bits)) return MDBX_BAD_TXN; - mdbx_tassert(txn, (txn->mt_flags & MDBX_NOTLS) == - ((txn->mt_flags & MDBX_TXN_RDONLY) - ? txn->mt_env->me_flags & MDBX_NOTLS - : 0)); + tASSERT(txn, (txn->mt_flags & MDBX_NOTLS) == + ((txn->mt_flags & MDBX_TXN_RDONLY) + ? 
txn->mt_env->me_flags & MDBX_NOTLS + : 0)); #if MDBX_TXN_CHECKOWNER - if (unlikely(txn->mt_owner != mdbx_thread_self()) && + if (unlikely(txn->mt_owner != osal_thread_self()) && (txn->mt_flags & (MDBX_NOTLS | MDBX_TXN_FINISHED)) == 0) return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; #endif /* MDBX_TXN_CHECKOWNER */ @@ -8006,14 +7958,14 @@ int mdbx_txn_renew(MDBX_txn *txn) { return rc; } - rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY); + rc = txn_renew(txn, MDBX_TXN_RDONLY); if (rc == MDBX_SUCCESS) { - txn->mt_owner = mdbx_thread_self(); - mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + txn->mt_owner = osal_thread_self(); + DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; } @@ -8070,31 +8022,31 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (env->me_options.spill_parent4child_denominator) { /* Spill dirty-pages of parent to provide dirtyroom for child txn */ - rc = mdbx_txn_spill(parent, nullptr, - parent->tw.dirtylist->length / - env->me_options.spill_parent4child_denominator); + rc = txn_spill(parent, nullptr, + parent->tw.dirtylist->length / + env->me_options.spill_parent4child_denominator); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + tASSERT(parent, audit_ex(parent, 0, false) == 0); flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); } else if (flags & MDBX_TXN_RDONLY) { if (env->me_txn0 && - unlikely(env->me_txn0->mt_owner == mdbx_thread_self()) && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) + 
unlikely(env->me_txn0->mt_owner == osal_thread_self()) && + (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) return MDBX_TXN_OVERLAPPING; } else { /* Reuse preallocated write txn. However, do not touch it until - * mdbx_txn_renew0() succeeds, since it currently may be active. */ + * txn_renew() succeeds, since it currently may be active. */ txn = env->me_txn0; goto renew; } size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); size += tsize = sizeof(MDBX_txn); - if (unlikely((txn = mdbx_malloc(size)) == NULL)) { - mdbx_debug("calloc: %s", "failed"); + if (unlikely((txn = osal_malloc(size)) == NULL)) { + DEBUG("calloc: %s", "failed"); return MDBX_ENOMEM; } #if MDBX_DEBUG @@ -8110,23 +8062,23 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_env = env; if (parent) { - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); txn->mt_dbiseqs = parent->mt_dbiseqs; txn->mt_geo = parent->mt_geo; - rc = mdbx_dpl_alloc(txn); + rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { const unsigned len = MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count; txn->tw.reclaimed_pglist = - mdbx_pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); + pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); if (unlikely(!txn->tw.reclaimed_pglist)) rc = MDBX_ENOMEM; } if (unlikely(rc != MDBX_SUCCESS)) { nested_failed: - mdbx_pnl_free(txn->tw.reclaimed_pglist); - mdbx_dpl_free(txn); - mdbx_free(txn); + pnl_free(txn->tw.reclaimed_pglist); + dpl_free(txn); + osal_free(txn); return rc; } @@ -8134,40 +8086,39 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.loose_count) { do { MDBX_page *lp = parent->tw.loose_pages; - const unsigned di = mdbx_dpl_exist(parent, lp->mp_pgno); - mdbx_tassert(parent, di && parent->tw.dirtylist->items[di].ptr == lp); - mdbx_tassert(parent, lp->mp_flags == P_LOOSE); - rc = - mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); + const unsigned di = dpl_exist(parent, lp->mp_pgno); + tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp); + tASSERT(parent, lp->mp_flags == P_LOOSE); + rc = pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto nested_failed; parent->tw.loose_pages = lp->mp_next; /* Remove from dirty list */ - mdbx_page_wash(parent, di, lp, 1); + page_wash(parent, di, lp, 1); } while (parent->tw.loose_pages); parent->tw.loose_count = 0; #if MDBX_ENABLE_REFUND parent->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); } txn->tw.dirtyroom = parent->tw.dirtyroom; txn->tw.dirtylru = parent->tw.dirtylru; dpl_sort(parent); if (parent->tw.spill_pages) - mdbx_spill_purge(parent); + spill_purge(parent); - mdbx_tassert(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= - MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); + tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= + MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); - mdbx_assert(env, pnl_check_allocated( - txn->tw.reclaimed_pglist, - 
(txn->mt_next_pgno /* LY: intentional assignment here, - only for assertion */ - = parent->mt_next_pgno) - - MDBX_ENABLE_REFUND)); + eASSERT(env, pnl_check_allocated( + txn->tw.reclaimed_pglist, + (txn->mt_next_pgno /* LY: intentional assignment here, + only for assertion */ + = parent->mt_next_pgno) - + MDBX_ENABLE_REFUND)); txn->tw.last_reclaimed = parent->tw.last_reclaimed; if (parent->tw.lifo_reclaimed) { @@ -8196,51 +8147,51 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, for (unsigned i = 0; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] = parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); env->me_txn = txn; - rc = mdbx_cursor_shadow(parent, txn); - if (mdbx_audit_enabled() && mdbx_assert_enabled()) { + rc = cursor_shadow(parent, txn); + if (AUDIT_ENABLED() && ASSERT_ENABLED()) { txn->mt_signature = MDBX_MT_SIGNATURE; - mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); + tASSERT(txn, audit_ex(txn, 0, false) == 0); } if (unlikely(rc != MDBX_SUCCESS)) - mdbx_txn_end(txn, MDBX_END_FAIL_BEGINCHILD); + txn_end(txn, MDBX_END_FAIL_BEGINCHILD); } else { /* MDBX_TXN_RDONLY */ txn->mt_dbiseqs = env->me_dbiseqs; renew: - rc = mdbx_txn_renew0(txn, flags); + rc = txn_renew(txn, flags); } if (unlikely(rc != MDBX_SUCCESS)) { if (txn != env->me_txn0) - mdbx_free(txn); + osal_free(txn); } else { if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) - mdbx_assert(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); + eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); else if (flags & MDBX_TXN_RDONLY) - mdbx_assert(env, (txn->mt_flags & - ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | - /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); + eASSERT(env, (txn->mt_flags & + ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); else { - mdbx_assert(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | - MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | - MDBX_TXN_SPILLS)) == 0); + eASSERT(env, (txn->mt_flags & + ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | + MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); } txn->mt_signature = MDBX_MT_SIGNATURE; txn->mt_userctx = context; *ret = txn; - mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', - (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, + (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; @@ -8256,7 +8207,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_env *const env = txn->mt_env; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -8279,7 +8230,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); info->txn_space_leftover = pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); } while (unlikely(head_meta != meta_prefer_last(env) || head_txnid != meta_txnid(env, head_meta))); @@ -8303,7 +8254,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { for (unsigned i = 0; i < snap_nreaders; ++i) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - mdbx_jitter4testing(true); + jitter4testing(true); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); const uint64_t snap_retired = @@ -8414,15 +8365,15 @@ static void dbi_import_locked(MDBX_txn *txn) { (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) || ((env->me_dbflags[i] & DB_VALID) && !(txn->mt_dbistate[i] & DBI_VALID))) { - mdbx_tassert(txn, (txn->mt_dbistate[i] & - (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); + tASSERT(txn, + (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); txn->mt_dbiseqs[i] = env->me_dbiseqs[i]; txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; 
txn->mt_dbistate[i] = 0; if (env->me_dbflags[i] & DB_VALID) { txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; - mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); - mdbx_tassert(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); + tASSERT(txn, txn->mt_dbxs[i].md_cmp != NULL); + tASSERT(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); } } } @@ -8447,17 +8398,17 @@ __cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { (dbi >= txn->mt_numdbs && dbi >= txn->mt_env->me_numdbs)) return false; - mdbx_ensure(txn->mt_env, mdbx_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == - MDBX_SUCCESS); + ENSURE(txn->mt_env, + osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); dbi_import_locked(txn); - mdbx_ensure(txn->mt_env, mdbx_fastmutex_release(&txn->mt_env->me_dbi_lock) == - MDBX_SUCCESS); + ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); return txn->mt_dbistate[dbi] & DBI_USRVALID; } /* Export or close DBI handles opened in this txn. 
*/ static void dbi_update(MDBX_txn *txn, int keep) { - mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); + tASSERT(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); MDBX_dbi n = txn->mt_numdbs; if (n) { bool locked = false; @@ -8467,8 +8418,7 @@ static void dbi_update(MDBX_txn *txn, int keep) { if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) continue; if (!locked) { - mdbx_ensure(env, - mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i]) @@ -8479,11 +8429,11 @@ static void dbi_update(MDBX_txn *txn, int keep) { char *ptr = env->me_dbxs[i].md_name.iov_base; if (ptr) { env->me_dbxs[i].md_name.iov_len = 0; - mdbx_memory_fence(mo_AcquireRelease, true); - mdbx_assert(env, env->me_dbflags[i] == 0); + osal_memory_fence(mo_AcquireRelease, true); + eASSERT(env, env->me_dbflags[i] == 0); env->me_dbiseqs[i] = dbi_seq(env, i); env->me_dbxs[i].md_name.iov_base = NULL; - mdbx_free(ptr); + osal_free(ptr); } } } @@ -8491,8 +8441,7 @@ static void dbi_update(MDBX_txn *txn, int keep) { n = env->me_numdbs; if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { if (!locked) { - mdbx_ensure(env, - mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } @@ -8503,30 +8452,27 @@ static void dbi_update(MDBX_txn *txn, int keep) { } if (unlikely(locked)) - mdbx_ensure(env, - mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } } /* Filter-out pgno list from transaction's dirty-page list */ -static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, - const bool spilled) { +static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { - 
mdbx_tassert(txn, - pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); + tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); MDBX_dpl *dl = dpl_sort(txn); /* Scanning in ascend order */ const int step = MDBX_PNL_ASCENDING ? 1 : -1; const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl); const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0; - mdbx_tassert(txn, pl[begin] <= pl[end - step]); + tASSERT(txn, pl[begin] <= pl[end - step]); unsigned r = dpl_search(txn, pl[begin] >> spilled); - mdbx_tassert(txn, dl->sorted == dl->length); + tASSERT(txn, dl->sorted == dl->length); for (int i = begin; r <= dl->length;) { /* scan loop */ assert(i != end); - mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); pgno_t pl_pgno = pl[i] >> spilled; pgno_t dp_pgno = dl->items[r].pgno; if (likely(dp_pgno != pl_pgno)) { @@ -8543,7 +8489,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, remove_dl: if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) { MDBX_page *dp = dl->items[r].ptr; - mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dl, r)); + dpage_free(txn->mt_env, dp, dpl_npages(dl, r)); } ++r; next_i: @@ -8554,7 +8500,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } else { while (r <= dl->length) { assert(i != end); - mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); pl_pgno = pl[i] >> spilled; dp_pgno = dl->items[r].pgno; if (dp_pgno < pl_pgno) @@ -8567,8 +8513,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } dl->sorted = dpl_setlen(dl, w - 1); txn->tw.dirtyroom += r - w; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); return; @@ -8580,50 +8525,49 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, * May be called twice for readonly txns: First reset it, then abort. * [in] txn the transaction handle to end * [in] mode why and how to end the transaction */ -static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { +static int txn_end(MDBX_txn *txn, const unsigned mode) { MDBX_env *env = txn->mt_env; static const char *const names[] = MDBX_END_NAMES; #if MDBX_ENV_CHECKPID - if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) { + if (unlikely(txn->mt_env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } #endif /* MDBX_ENV_CHECKPID */ - mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - names[mode & MDBX_END_OPMASK], txn->mt_txnid, - (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, - (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + DEBUG("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + names[mode & MDBX_END_OPMASK], txn->mt_txnid, + (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)env, + txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ - mdbx_cursors_eot(txn, false); + cursors_eot(txn, false); int rc = MDBX_SUCCESS; if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (txn->to.reader) { MDBX_reader *slot = txn->to.reader; - mdbx_assert(env, slot->mr_pid.weak == env->me_pid); + eASSERT(env, slot->mr_pid.weak == env->me_pid); if (likely(!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED))) { - mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.weak && - slot->mr_txnid.weak >= - env->me_lck->mti_oldest_reader.weak); + eASSERT(env, + txn->mt_txnid == slot->mr_txnid.weak && + slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); safe64_reset(&slot->mr_txnid, false); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); } else { - mdbx_assert(env, slot->mr_pid.weak == env->me_pid); - mdbx_assert(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + eASSERT(env, slot->mr_pid.weak == env->me_pid); + eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); } if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) @@ -8633,7 +8577,7 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { } #if defined(_WIN32) || defined(_WIN64) if (txn->mt_flags & MDBX_SHRINK_ALLOWED) - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); #endif txn->mt_numdbs = 0; /* prevent further DBI activity */ txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; @@ 
-8641,45 +8585,44 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (txn == env->me_txn0) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif txn->mt_flags = MDBX_TXN_FINISHED; txn->mt_owner = 0; env->me_txn = txn->mt_parent; - mdbx_pnl_free(txn->tw.spill_pages); + pnl_free(txn->tw.spill_pages); txn->tw.spill_pages = nullptr; if (txn == env->me_txn0) { - mdbx_assert(env, txn->mt_parent == NULL); + eASSERT(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ dbi_update(txn, mode & MDBX_END_UPDATE); - mdbx_pnl_shrink(&txn->tw.retired_pages); - mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); + pnl_shrink(&txn->tw.retired_pages); + pnl_shrink(&txn->tw.reclaimed_pglist); if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dlist_free(txn); + dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. */ mdbx_txn_unlock(env); } else { - mdbx_assert(env, txn->mt_parent != NULL); + eASSERT(env, txn->mt_parent != NULL); MDBX_txn *const parent = txn->mt_parent; - mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); - mdbx_assert(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert(env, - pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (txn->tw.lifo_reclaimed) { - mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= - (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); + eASSERT(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= + (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 
(unsigned)(uintptr_t)parent->tw.lifo_reclaimed; parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; } if (txn->tw.retired_pages) { - mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= - (unsigned)(uintptr_t)parent->tw.retired_pages); + eASSERT(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= + (unsigned)(uintptr_t)parent->tw.retired_pages); MDBX_PNL_SIZE(txn->tw.retired_pages) = (unsigned)(uintptr_t)parent->tw.retired_pages; parent->tw.retired_pages = txn->tw.retired_pages; @@ -8688,33 +8631,33 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { parent->mt_child = nullptr; parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; parent->tw.dirtylru = txn->tw.dirtylru; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + tASSERT(parent, dirtylist_check(parent)); + tASSERT(parent, audit_ex(parent, 0, false) == 0); if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dlist_free(txn); - mdbx_dpl_free(txn); - mdbx_pnl_free(txn->tw.reclaimed_pglist); + dlist_free(txn); + dpl_free(txn); + pnl_free(txn->tw.reclaimed_pglist); if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { /* undo resize performed by child txn */ - rc = mdbx_mapresize_implicit(env, parent->mt_next_pgno, - parent->mt_geo.now, parent->mt_geo.upper); + rc = map_resize_implicit(env, parent->mt_next_pgno, parent->mt_geo.now, + parent->mt_geo.upper); if (rc == MDBX_EPERM) { /* unable undo resize (it is regular for Windows), * therefore promote size changes from child to the parent txn */ - mdbx_warning("unable undo resize performed by child txn, promote to " - "the parent (%u->%u, %u->%u)", - txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, - parent->mt_geo.upper); + WARNING("unable undo resize performed by child txn, promote to " + "the parent (%u->%u, %u->%u)", + txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, + parent->mt_geo.upper); parent->mt_geo.now = txn->mt_geo.now; parent->mt_geo.upper = 
txn->mt_geo.upper; parent->mt_flags |= MDBX_TXN_DIRTY; rc = MDBX_SUCCESS; } else if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_error("error %d while undo resize performed by child txn, fail " - "the parent", - rc); + ERROR("error %d while undo resize performed by child txn, fail " + "the parent", + rc); parent->mt_flags |= MDBX_TXN_ERROR; if (!env->me_dxb_mmap.address) env->me_flags |= MDBX_FATAL_ERROR; @@ -8723,10 +8666,10 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { } } - mdbx_assert(env, txn == env->me_txn0 || txn->mt_owner == 0); + eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0); if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) { txn->mt_signature = 0; - mdbx_free(txn); + osal_free(txn); } return rc; @@ -8742,10 +8685,10 @@ int mdbx_txn_reset(MDBX_txn *txn) { return MDBX_EINVAL; /* LY: don't close DBI-handles */ - rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); + rc = txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); if (rc == MDBX_SUCCESS) { - mdbx_tassert(txn, txn->mt_signature == MDBX_MT_SIGNATURE); - mdbx_tassert(txn, txn->mt_owner == 0); + tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE); + tASSERT(txn, txn->mt_owner == 0); } return rc; } @@ -8770,20 +8713,20 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) /* LY: don't close DBI-handles */ - return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | - MDBX_END_FREE); + return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | + MDBX_END_FREE); if (txn->mt_child) mdbx_txn_abort(txn->mt_child); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); + tASSERT(txn, dirtylist_check(txn)); + return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); } /* Count all the pages in each DB and in the GC and make sure * it matches the actual number of pages being used. 
*/ -__cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, - bool dont_filter_gc) { +__cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, + bool dont_filter_gc) { pgno_t pending = 0; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + @@ -8791,7 +8734,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, } MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, FREE_DBI); + int rc = cursor_init(&cx.outer, txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -8813,7 +8756,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, gc += *(pgno_t *)data.iov_base; skip:; } - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] &= ~DBI_AUDITED; @@ -8822,7 +8765,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) { if (!(txn->mt_dbistate[i] & DBI_VALID)) continue; - rc = mdbx_cursor_init(&cx.outer, txn, i); + rc = cursor_init(&cx.outer, txn, i); if (unlikely(rc != MDBX_SUCCESS)) return rc; txn->mt_dbistate[i] |= DBI_AUDITED; @@ -8833,7 +8776,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, if (i != MAIN_DBI) continue; - rc = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (rc == MDBX_SUCCESS) { MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; for (unsigned j = 0; j < page_numkeys(mp); j++) { @@ -8861,9 +8804,9 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; } } - rc = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); } - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); } for (MDBX_dbi i = 
FREE_DBI; i < txn->mt_numdbs; i++) { @@ -8878,12 +8821,12 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, break; } if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { - mdbx_warning("audit %s@%" PRIaTXN - ": unable account dbi %d / \"%*s\", state 0x%02x", - txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, - (int)txn->mt_dbxs[i].md_name.iov_len, - (const char *)txn->mt_dbxs[i].md_name.iov_base, - txn->mt_dbistate[i]); + WARNING("audit %s@%" PRIaTXN + ": unable account dbi %d / \"%*s\", state 0x%02x", + txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, + (int)txn->mt_dbxs[i].md_name.iov_len, + (const char *)txn->mt_dbxs[i].md_name.iov_base, + txn->mt_dbistate[i]); } } @@ -8891,17 +8834,17 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, return MDBX_SUCCESS; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " - "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", - txn->mt_txnid, pending, txn->tw.loose_count, - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), - txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, - retired_stored); - mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO - "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO - "(allocated)", - txn->mt_txnid, pending, gc, used, pending + gc + used, - txn->mt_next_pgno); + ERROR("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " + "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", + txn->mt_txnid, pending, txn->tw.loose_count, + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.retired_pages ? 
MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, + retired_stored); + ERROR("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO + "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO + "(allocated)", + txn->mt_txnid, pending, gc, used, pending + gc + used, + txn->mt_next_pgno); return MDBX_PROBLEM; } @@ -8922,7 +8865,7 @@ static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { #if MDBX_ENABLE_BIGFOOT ctx->bigfoot = txn->mt_txnid; #endif /* MDBX_ENABLE_BIGFOOT */ - return mdbx_cursor_init(&ctx->cursor.outer, txn, FREE_DBI); + return cursor_init(&ctx->cursor.outer, txn, FREE_DBI); } static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) { @@ -8941,12 +8884,12 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { #endif /* MDBX_ENABLE_BIGFOOT */ key.iov_len = sizeof(txnid_t); const struct cursor_set_result csr = - mdbx_cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); + cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; err = mdbx_cursor_del(&ctx->cursor.outer, 0); - mdbx_trace("== clear-4linear, backlog %u, err %d", - gcu_backlog_size(txn), err); + TRACE("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn), + err); } } #if MDBX_ENABLE_BIGFOOT @@ -8975,10 +8918,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, : (backlog4cow + backlog4rebalance)))) return MDBX_SUCCESS; - mdbx_trace( - ">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u", - reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, - backlog4cow, backlog4rebalance); + TRACE(">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u", + reserve4retired ? 
'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, + backlog4cow, backlog4rebalance); int err; if (unlikely(pages4retiredlist > 2)) { @@ -8986,30 +8928,28 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, key.iov_base = val.iov_base = nullptr; key.iov_len = sizeof(txnid_t); val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - err = mdbx_cursor_spill(&ctx->cursor.outer, &key, &val); + err = cursor_spill(&ctx->cursor.outer, &key, &val); if (unlikely(err != MDBX_SUCCESS)) return err; } ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; - err = mdbx_cursor_touch(&ctx->cursor.outer); - mdbx_trace("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err); + err = cursor_touch(&ctx->cursor.outer); + TRACE("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err); if (unlikely(pages4retiredlist > 1) && MDBX_PNL_SIZE(txn->tw.retired_pages) != ctx->retired_stored && err == MDBX_SUCCESS) { - mdbx_tassert(txn, reserve4retired); + tASSERT(txn, reserve4retired); err = gcu_clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; err = page_alloc_slowpath(&ctx->cursor.outer, pages4retiredlist, MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) .err; - mdbx_trace("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), - err); - mdbx_cassert(&ctx->cursor.outer, - gcu_backlog_size(txn) >= pages4retiredlist || - err != MDBX_SUCCESS); + TRACE("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), err); + cASSERT(&ctx->cursor.outer, + gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); } while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && @@ -9020,7 +8960,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, .err; ctx->cursor.outer.mc_flags |= C_RECLAIMING; - mdbx_trace("<< backlog %u, err %d", gcu_backlog_size(txn), err); + TRACE("<< backlog %u, err %d", gcu_backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? 
err : MDBX_SUCCESS; } @@ -9043,8 +8983,8 @@ static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { * "checks and balances") to partially bypass the fundamental design problems * inherited from LMDB. So do not try to understand it completely in order to * avoid your madness. */ -static int mdbx_update_gc(MDBX_txn *txn, gcu_context_t *ctx) { - mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); +static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { + TRACE("\n>>> @%" PRIaTXN, txn->mt_txnid); MDBX_env *const env = txn->mt_env; const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; (void)dbg_prefix_mode; @@ -9058,14 +8998,13 @@ static int mdbx_update_gc(MDBX_txn *txn, gcu_context_t *ctx) { retry: ++ctx->loop; - mdbx_trace("%s", " >> restart"); + TRACE("%s", " >> restart"); int rc = MDBX_SUCCESS; - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { - mdbx_error("too more loops %u, bailout", ctx->loop); + ERROR("too more loops %u, bailout", ctx->loop); rc = MDBX_PROBLEM; goto bailout; } @@ -9085,7 +9024,7 @@ retry: while (true) { /* Come back here after each Put() in case retired-list changed */ MDBX_val key, data; - mdbx_trace("%s", " >> continue"); + TRACE("%s", " >> continue"); if (ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && (MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || @@ -9095,8 +9034,7 @@ retry: goto bailout; } - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (ctx->lifo) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed @@ -9109,9 +9047,9 @@ retry: /* LY: cleanup reclaimed records. 
*/ do { ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; - mdbx_tassert(txn, ctx->cleaned_slot > 0 && - ctx->cleaned_id <= - env->me_lck->mti_oldest_reader.weak); + tASSERT(txn, + ctx->cleaned_slot > 0 && + ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &ctx->cleaned_id; key.iov_len = sizeof(ctx->cleaned_id); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET); @@ -9124,22 +9062,21 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, - ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); - mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, - ctx->cleaned_slot, ctx->cleaned_id); - mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer); + tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + TRACE("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, + ctx->cleaned_slot, ctx->cleaned_id); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); rc = mdbx_cursor_del(&ctx->cursor.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } while (ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); - mdbx_txl_sort(txn->tw.lifo_reclaimed); + txl_sort(txn->tw.lifo_reclaimed); } } else { /* If using records from GC which we have not yet deleted, * now delete them and any we reserved for tw.reclaimed_pglist. 
*/ while (ctx->cleaned_id <= txn->tw.last_reclaimed) { - rc = mdbx_cursor_first(&ctx->cursor.outer, &key, NULL); + rc = cursor_first(&ctx->cursor.outer, &key, NULL); if (rc == MDBX_NOTFOUND) break; if (unlikely(rc != MDBX_SUCCESS)) @@ -9160,35 +9097,32 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); - mdbx_tassert(txn, - ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); - mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, - ctx->cleaned_id); - mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer); + tASSERT(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); + tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, + ctx->cleaned_id); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); rc = mdbx_cursor_del(&ctx->cursor.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, ctx->retired_stored, false); + tASSERT(txn, dirtylist_check(txn)); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } /* return suitable into unallocated space */ - if (mdbx_refund(txn)) { - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + if (txn_refund(txn)) { + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, ctx->retired_stored, false); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9201,32 +9135,31 @@ retry: * The pages themselves remain in dirtylist. 
*/ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (txn->tw.loose_count > 0) { - mdbx_trace("%s: try allocate gc-slot for %u loose-pages", - dbg_prefix_mode, txn->tw.loose_count); + TRACE("%s: try allocate gc-slot for %u loose-pages", dbg_prefix_mode, + txn->tw.loose_count); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) .err; if (rc == MDBX_SUCCESS) { - mdbx_trace("%s: retry since gc-slot for %u loose-pages available", - dbg_prefix_mode, txn->tw.loose_count); + TRACE("%s: retry since gc-slot for %u loose-pages available", + dbg_prefix_mode, txn->tw.loose_count); continue; } /* Put loose page numbers in tw.retired_pages, * since unable to return them to tw.reclaimed_pglist. */ - if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, - txn->tw.loose_count)) != 0)) + if (unlikely((rc = pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) goto bailout; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) - mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - mdbx_trace("%s: append %u loose-pages to retired-pages", - dbg_prefix_mode, txn->tw.loose_count); + pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + TRACE("%s: append %u loose-pages to retired-pages", dbg_prefix_mode, + txn->tw.loose_count); } } else { /* Room for loose pages + temp PNL with same */ - rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, - 2 * txn->tw.loose_count + 2); + rc = pnl_need(&txn->tw.reclaimed_pglist, 2 * txn->tw.loose_count + 2); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; MDBX_PNL loose = txn->tw.reclaimed_pglist + @@ -9234,15 +9167,15 @@ retry: txn->tw.loose_count - 1; unsigned count = 0; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { - mdbx_tassert(txn, mp->mp_flags == P_LOOSE); + tASSERT(txn, mp->mp_flags == P_LOOSE); loose[++count] = mp->mp_pgno; } - mdbx_tassert(txn, count == txn->tw.loose_count); + tASSERT(txn, count == txn->tw.loose_count); 
MDBX_PNL_SIZE(loose) = count; - mdbx_pnl_sort(loose, txn->mt_next_pgno); + pnl_sort(loose, txn->mt_next_pgno); pnl_merge(txn->tw.reclaimed_pglist, loose); - mdbx_trace("%s: append %u loose-pages to reclaimed-pages", - dbg_prefix_mode, txn->tw.loose_count); + TRACE("%s: append %u loose-pages to reclaimed-pages", dbg_prefix_mode, + txn->tw.loose_count); } /* filter-out list of dirty-pages from loose-pages */ @@ -9250,25 +9183,24 @@ retry: unsigned w = 0; for (unsigned r = w; ++r <= dl->length;) { MDBX_page *dp = dl->items[r].ptr; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); - mdbx_tassert(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); + tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); if ((dp->mp_flags & P_LOOSE) == 0) { if (++w != r) dl->items[w] = dl->items[r]; } else { - mdbx_tassert(txn, dp->mp_flags == P_LOOSE); + tASSERT(txn, dp->mp_flags == P_LOOSE); if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(env, dp, 1); + dpage_free(env, dp, 1); } } - mdbx_trace("%s: filtered-out loose-pages from %u -> %u dirty-pages", - dbg_prefix_mode, dl->length, w); - mdbx_tassert(txn, txn->tw.loose_count == dl->length - w); + TRACE("%s: filtered-out loose-pages from %u -> %u dirty-pages", + dbg_prefix_mode, dl->length, w); + tASSERT(txn, txn->tw.loose_count == dl->length - w); dpl_setlen(dl, w); dl->sorted = 0; txn->tw.dirtyroom += txn->tw.loose_count; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); txn->tw.loose_pages = NULL; @@ -9284,8 +9216,8 @@ retry: if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; - rc = mdbx_page_search(&ctx->cursor.outer, NULL, - MDBX_PS_LAST | MDBX_PS_MODIFY); + rc = page_search(&ctx->cursor.outer, NULL, + MDBX_PS_LAST | MDBX_PS_MODIFY); ctx->cursor.outer.mc_flags |= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; @@ -9296,7 +9228,7 @@ retry: do { if (ctx->bigfoot > txn->mt_txnid) { rc = gcu_clean_stored_retired(txn, ctx); - mdbx_tassert(txn, ctx->bigfoot <= txn->mt_txnid); + tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); } retired_pages_before = MDBX_PNL_SIZE(txn->tw.retired_pages); @@ -9304,7 +9236,7 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); + pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); ctx->retired_stored = 0; ctx->bigfoot = txn->mt_txnid; do { @@ -9337,11 +9269,11 @@ retry: *begin = chunk; memcpy(data.iov_base, begin, data.iov_len); *begin = save; - mdbx_trace("%s: put-retired/bigfoot @ %" PRIaTXN - " (slice #%u) #%u [%u..%u] of %u", - dbg_prefix_mode, ctx->bigfoot, - (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, - at + chunk, retired_pages_before); + TRACE("%s: put-retired/bigfoot @ %" PRIaTXN + " (slice #%u) #%u [%u..%u] of %u", + dbg_prefix_mode, ctx->bigfoot, + (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, + at + chunk, retired_pages_before); } ctx->retired_stored += chunk; } while (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages) && @@ -9361,27 +9293,25 @@ retry: } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); ctx->retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); - mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); - mdbx_assert(env, data.iov_len == 
MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); + eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, - ctx->retired_stored, txn->mt_txnid); + TRACE("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, + ctx->retired_stored, txn->mt_txnid); #endif /* MDBX_ENABLE_BIGFOOT */ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { unsigned i = ctx->retired_stored; - mdbx_debug_extra("txn %" PRIaTXN " root %" PRIaPGNO - " num %u, retired-PNL", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); + DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %u, retired-PNL", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); - mdbx_debug_extra_print("%s\n", "."); + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); + DEBUG_EXTRA_PRINT("%s\n", "."); } if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && ctx->settled)) { - mdbx_trace("%s: reclaimed-list changed %u -> %u, retry", - dbg_prefix_mode, amount, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + TRACE("%s: reclaimed-list changed %u -> %u, retry", dbg_prefix_mode, + amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); goto retry /* rare case, but avoids GC fragmentation and one cycle. 
*/ ; @@ -9390,25 +9320,24 @@ retry: } /* handle reclaimed and lost pages - merge and store both into gc */ - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, txn->tw.loose_count == 0); + tASSERT(txn, txn->tw.loose_count == 0); - mdbx_trace("%s", " >> reserving"); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, ctx->retired_stored, false); + TRACE("%s", " >> reserving"); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } const unsigned left = amount - ctx->settled; - mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " - "reused-gc-slots %u", - dbg_prefix_mode, amount, ctx->settled, (int)left, - txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0, - ctx->reused_slot); + TRACE("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " + "reused-gc-slots %u", + dbg_prefix_mode, amount, ctx->settled, (int)left, + txn->tw.lifo_reclaimed + ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0, + ctx->reused_slot); if (0 >= (int)left) break; @@ -9416,7 +9345,7 @@ retry: txnid_t reservation_gc_id; if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { - txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + txn->tw.lifo_reclaimed = txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { rc = MDBX_ENOMEM; goto bailout; @@ -9440,8 +9369,8 @@ retry: MDBX_ALLOC_FAKE) .err; if (likely(rc == MDBX_SUCCESS)) { - mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, - MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); + TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, + MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); need_cleanup = true; } } while (rc == MDBX_SUCCESS && @@ -9453,7 +9382,7 @@ retry: ctx->cursor.outer.mc_flags |= C_RECLAIMING; if (likely(rc == MDBX_SUCCESS)) { - mdbx_trace("%s: got enough from GC.", dbg_prefix_mode); + TRACE("%s: got enough from GC.", dbg_prefix_mode); continue; } else if (unlikely(rc != MDBX_NOTFOUND)) /* LY: some troubles... */ @@ -9461,12 +9390,12 @@ retry: if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { if (need_cleanup) { - mdbx_txl_sort(txn->tw.lifo_reclaimed); + txl_sort(txn->tw.lifo_reclaimed); ctx->cleaned_slot = 0; } ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { - mdbx_tassert(txn, txn->tw.last_reclaimed == 0); + tASSERT(txn, txn->tw.last_reclaimed == 0); if (unlikely(find_oldest_reader(env) != snap_oldest)) /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) * if the oldest reader changes since the last attempt */ @@ -9474,8 +9403,8 @@ retry: /* no reclaimable GC entries, * therefore no entries with ID < mdbx_find_oldest(txn) */ txn->tw.last_reclaimed = ctx->rid = snap_oldest; - mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, - dbg_prefix_mode, ctx->rid); + TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, + ctx->rid); } /* LY: GC is empty, will look any free txn-id in high2low order. 
*/ @@ -9486,28 +9415,27 @@ retry: if (unlikely(ctx->rid <= MIN_TXNID)) { if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= ctx->reused_slot)) { - mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " - "lifo_reclaimed %u" PRIaTXN, - ctx->reused_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + NOTICE("** restart: reserve depleted (reused_gc_slot %u >= " + "lifo_reclaimed %u" PRIaTXN, + ctx->reused_slot, + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); goto retry; } break; } - mdbx_tassert(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); + tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); --ctx->rid; key.iov_base = &ctx->rid; key.iov_len = sizeof(ctx->rid); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); if (unlikely(rc == MDBX_SUCCESS)) { - mdbx_debug("%s: GC's id %" PRIaTXN - " is used, continue bottom-up search", - dbg_prefix_mode, ctx->rid); + DEBUG("%s: GC's id %" PRIaTXN " is used, continue bottom-up search", + dbg_prefix_mode, ctx->rid); ++ctx->rid; rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST); if (rc == MDBX_NOTFOUND) { - mdbx_debug("%s: GC is empty (going dense-mode)", dbg_prefix_mode); + DEBUG("%s: GC is empty (going dense-mode)", dbg_prefix_mode); ctx->dense = true; break; } @@ -9518,17 +9446,17 @@ retry: } txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); if (gc_first <= MIN_TXNID) { - mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN - " (going dense-mode)", - dbg_prefix_mode, ctx->rid); + DEBUG("%s: no free GC's id(s) less than %" PRIaTXN + " (going dense-mode)", + dbg_prefix_mode, ctx->rid); ctx->dense = true; break; } ctx->rid = gc_first - 1; } - mdbx_assert(env, !ctx->dense); - rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, ctx->rid); + eASSERT(env, !ctx->dense); + rc = txl_append(&txn->tw.lifo_reclaimed, ctx->rid); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -9540,16 +9468,15 @@ retry: ctx->cleaned_slot += 1 /* mark cleanup is not needed for 
added slot. */; - mdbx_trace("%s: append @%" PRIaTXN - " to lifo-reclaimed, cleaned-gc-slot = %u", - dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); + TRACE("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %u", + dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); } if (need_cleanup || ctx->dense) { if (ctx->cleaned_slot) - mdbx_trace( - "%s: restart inner-loop to clear and re-create GC entries", - dbg_prefix_mode); + TRACE("%s: restart inner-loop to clear and re-create GC entries", + dbg_prefix_mode); ctx->cleaned_slot = 0; continue; } @@ -9557,12 +9484,12 @@ retry: const unsigned i = (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; - mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + tASSERT(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); reservation_gc_id = txn->tw.lifo_reclaimed[i]; - mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", - dbg_prefix_mode, reservation_gc_id, i); + TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", dbg_prefix_mode, + reservation_gc_id, i); } else { - mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); + tASSERT(txn, txn->tw.lifo_reclaimed == NULL); if (unlikely(ctx->rid == 0)) { ctx->rid = find_oldest_reader(env); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); @@ -9575,7 +9502,7 @@ retry: if (ctx->rid >= gc_first) ctx->rid = gc_first - 1; if (unlikely(ctx->rid == 0)) { - mdbx_error("%s", "** no GC tail-space to store (going dense-mode)"); + ERROR("%s", "** no GC tail-space to store (going dense-mode)"); ctx->dense = true; goto retry; } @@ -9585,8 +9512,8 @@ retry: ctx->cleaned_id = ctx->rid + 1; } reservation_gc_id = ctx->rid--; - mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, - reservation_gc_id); + TRACE("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, + reservation_gc_id); } ++ctx->reused_slot; @@ -9638,22 +9565,22 @@ retry: } } } - mdbx_tassert(txn, chunk > 0); + tASSERT(txn, chunk > 0); - 
mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " - "%" PRIaTXN, - dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); + TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " + "%" PRIaTXN, + dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); - mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, - env->me_maxgc_ov1page); + TRACE("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, + env->me_maxgc_ov1page); - mdbx_tassert(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); + tASSERT(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); if (unlikely( reservation_gc_id < MIN_TXNID || reservation_gc_id > atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { - mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")", - reservation_gc_id); + ERROR("** internal error (reservation_gc_id %" PRIaTXN ")", + reservation_gc_id); rc = MDBX_PROBLEM; goto bailout; } @@ -9661,50 +9588,47 @@ retry: key.iov_len = sizeof(reservation_gc_id); key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); - mdbx_trace("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, - ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); + TRACE("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, + ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); gcu_prepare_backlog(txn, ctx, true); rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; gcu_clean_reserved(env, data); ctx->settled += chunk; - mdbx_trace("%s: settled %u (+%u), continue", dbg_prefix_mode, ctx->settled, - chunk); + TRACE("%s: settled %u (+%u), continue", dbg_prefix_mode, ctx->settled, + chunk); if 
(txn->tw.lifo_reclaimed && unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) && (ctx->loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > env->me_maxgc_ov1page)) { - mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + NOTICE("** restart: reclaimed-list growth %u -> %u", amount, + (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); goto retry; } continue; } - mdbx_tassert( - txn, - ctx->cleaned_slot == - (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); + tASSERT(txn, ctx->cleaned_slot == (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)); - mdbx_trace("%s", " >> filling"); + TRACE("%s", " >> filling"); /* Fill in the reserved records */ ctx->filled_slot = txn->tw.lifo_reclaimed ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot : ctx->reused_slot; rc = MDBX_SUCCESS; - mdbx_tassert(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ @@ -9713,62 +9637,62 @@ retry: const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); unsigned left = amount; if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, ctx->lifo == 0); - rc = mdbx_cursor_first(&ctx->cursor.outer, &key, &data); + tASSERT(txn, ctx->lifo == 0); + rc = cursor_first(&ctx->cursor.outer, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { - mdbx_tassert(txn, ctx->lifo != 0); + tASSERT(txn, ctx->lifo != 0); } while (true) { txnid_t fill_gc_id; - mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + TRACE("%s: left %u of %u", dbg_prefix_mode, left, + 
(unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, ctx->lifo == 0); + tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { - mdbx_notice( + NOTICE( "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN, ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); goto retry; } } else { - mdbx_tassert(txn, ctx->lifo != 0); + tASSERT(txn, ctx->lifo != 0); if (++ctx->filled_slot > (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { - mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " - "lifo_reclaimed %u" PRIaTXN, - ctx->filled_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + NOTICE("** restart: reserve depleted (filled_gc_slot %u > " + "lifo_reclaimed %u" PRIaTXN, + ctx->filled_slot, + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); goto retry; } fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; - mdbx_trace("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", - dbg_prefix_mode, fill_gc_id, ctx->filled_slot); + TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", + dbg_prefix_mode, fill_gc_id, ctx->filled_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, ctx->cleaned_slot == - (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)); - mdbx_tassert(txn, fill_gc_id > 0 && - fill_gc_id <= env->me_lck->mti_oldest_reader.weak); + tASSERT(txn, + ctx->cleaned_slot == (txn->tw.lifo_reclaimed + ? 
MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)); + tASSERT(txn, fill_gc_id > 0 && + fill_gc_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); + tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); ctx->cursor.outer.mc_flags |= C_GCFREEZE; unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; if (unlikely(chunk > left)) { - mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, - left, fill_gc_id); + TRACE("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, + left, fill_gc_id); if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); @@ -9786,24 +9710,24 @@ retry: if (unlikely(txn->tw.loose_count || amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { - mdbx_notice("** restart: reclaimed-list growth (%u -> %u, loose +%u)", - amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), - txn->tw.loose_count); + NOTICE("** restart: reclaimed-list growth (%u -> %u, loose +%u)", + amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.loose_count); goto retry; } if (unlikely(txn->tw.lifo_reclaimed ? 
ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : ctx->cleaned_id < txn->tw.last_reclaimed)) { - mdbx_notice("%s", "** restart: reclaimed-slots changed"); + NOTICE("%s", "** restart: reclaimed-slots changed"); goto retry; } if (unlikely(ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) { - mdbx_tassert(txn, ctx->retired_stored < - MDBX_PNL_SIZE(txn->tw.retired_pages)); - mdbx_notice("** restart: retired-list growth (%u -> %u)", - ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); + tASSERT(txn, + ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); + NOTICE("** restart: retired-list growth (%u -> %u)", + ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); goto retry; } @@ -9812,15 +9736,14 @@ retry: pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; - mdbx_trace("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO - "] @%" PRIaTXN, - dbg_prefix_mode, chunk, - (unsigned)(from - txn->tw.reclaimed_pglist), from[0], - (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], fill_gc_id); + TRACE("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO "] @%" PRIaTXN, + dbg_prefix_mode, chunk, (unsigned)(from - txn->tw.reclaimed_pglist), + from[0], (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], + fill_gc_id); left -= chunk; - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, ctx->retired_stored + amount - left, true); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored + amount - left, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9830,19 +9753,19 @@ retry: } if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, ctx->lifo == 0); - rc = mdbx_cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT); + tASSERT(txn, ctx->lifo == 0); + rc = cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { - mdbx_tassert(txn, ctx->lifo != 0); + tASSERT(txn, 
ctx->lifo != 0); } } } - mdbx_tassert(txn, rc == MDBX_SUCCESS); + tASSERT(txn, rc == MDBX_SUCCESS); if (unlikely(txn->tw.loose_count != 0)) { - mdbx_notice("** restart: got %u loose pages", txn->tw.loose_count); + NOTICE("** restart: got %u loose pages", txn->tw.loose_count); goto retry; } if (unlikely(ctx->filled_slot != @@ -9851,25 +9774,24 @@ retry: : 0))) { const bool will_retry = ctx->loop < 9; - mdbx_notice("** %s: reserve excess (filled-slot %u, loop %u)", - will_retry ? "restart" : "ignore", ctx->filled_slot, ctx->loop); + NOTICE("** %s: reserve excess (filled-slot %u, loop %u)", + will_retry ? "restart" : "ignore", ctx->filled_slot, ctx->loop); if (will_retry) goto retry; } - mdbx_tassert(txn, - txn->tw.lifo_reclaimed == NULL || + tASSERT(txn, txn->tw.lifo_reclaimed == NULL || ctx->cleaned_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); bailout: txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; - mdbx_trace("<<< %u loops, rc = %d", ctx->loop, rc); + TRACE("<<< %u loops, rc = %d", ctx->loop, rc); return rc; } -static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { +static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { MDBX_dpl *const dl = (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn); int rc = MDBX_SUCCESS; @@ -9887,16 +9809,16 @@ static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { } if (ctx->iov_items) - rc = mdbx_iov_write(txn, ctx); + rc = iov_write(txn, ctx); while (r <= dl->length) dl->items[++w] = dl->items[r++]; dl->sorted = dpl_setlen(dl, w); txn->tw.dirtyroom += r - 1 - w; - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); return rc; } @@ -9904,7 +9826,7 @@ static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi, unsigned validity) { if (likely(dbi < txn->mt_numdbs)) { - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); if (likely(!TXN_DBI_CHANGED(txn, dbi))) { if (likely(txn->mt_dbistate[dbi] & validity)) return true; @@ -9921,8 +9843,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Merge child txn into parent */ -static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, - const unsigned parent_retired_len) { +static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, + const unsigned parent_retired_len) { MDBX_dpl *const src = dpl_sort(txn); /* Remove refunded pages from parent's dirty list */ @@ -9932,37 +9854,37 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, while (n && dst->items[n].pgno >= parent->mt_next_pgno) { if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { MDBX_page *dp = dst->items[n].ptr; - mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dst, n)); + dpage_free(txn->mt_env, dp, dpl_npages(dst, n)); } --n; } parent->tw.dirtyroom += dst->sorted - n; dst->sorted = dpl_setlen(dst, n); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); } /* Remove reclaimed pages from parent's dirty list */ const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; - mdbx_dpl_sift(parent, reclaimed_list, false); + dpl_sift(parent, reclaimed_list, false); /* Move retired pages from parent's dirty & spilled list to reclaimed */ unsigned r, w, d, s, l; for (r = w = parent_retired_len; ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) { const pgno_t pgno = parent->tw.retired_pages[r]; - const unsigned di = mdbx_dpl_exist(parent, pgno); - const unsigned si = !di ? mdbx_search_spilled(parent, pgno) : 0; + const unsigned di = dpl_exist(parent, pgno); + const unsigned si = !di ? search_spilled(parent, pgno) : 0; unsigned npages; const char *kind; if (di) { MDBX_page *dp = dst->items[di].ptr; - mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | - P_OVERFLOW | P_SPILLED)) == 0); + tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | + P_OVERFLOW | P_SPILLED)) == 0); npages = dpl_npages(dst, di); - mdbx_page_wash(parent, di, dp, npages); + page_wash(parent, di, dp, npages); kind = "dirty"; l = 1; if (unlikely(npages > l)) { @@ -9994,23 +9916,23 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } } else if (unlikely(si)) { l = npages = 1; - mdbx_spill_remove(parent, si, 1); + spill_remove(parent, si, 1); kind = "spilled"; } else { parent->tw.retired_pages[++w] = pgno; continue; } - mdbx_debug("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, - kind, pgno); - int err = mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); - mdbx_ensure(txn->mt_env, err == MDBX_SUCCESS); + DEBUG("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, kind, + pgno); + int err = pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); + ENSURE(txn->mt_env, err == MDBX_SUCCESS); } MDBX_PNL_SIZE(parent->tw.retired_pages) = w; /* Filter-out parent spill list */ if 
(parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { - const MDBX_PNL sl = mdbx_spill_purge(parent); + const MDBX_PNL sl = spill_purge(parent); unsigned len = MDBX_PNL_SIZE(sl); if (len) { /* Remove refunded pages from parent's spill list */ @@ -10021,7 +9943,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); do { if ((sl[i] & 1) == 0) - mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); i -= 1; } while (i && sl[i] >= (parent->mt_next_pgno << 1)); MDBX_PNL_SIZE(sl) = i; @@ -10031,14 +9953,13 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, do { ++i; if ((sl[i] & 1) == 0) - mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); MDBX_PNL_SIZE(sl) = len -= i; memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); #endif } - mdbx_tassert(txn, - pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); + tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); /* Remove reclaimed pages from parent's spill list */ s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); @@ -10055,9 +9976,9 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, s -= !cmp; r -= cmp; } else { - mdbx_debug("remove reclaimed parent's spilled page %" PRIaPGNO, - reclaimed_pgno); - mdbx_spill_remove(parent, s, 1); + DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, + reclaimed_pgno); + spill_remove(parent, s, 1); --s; --r; } @@ -10086,41 +10007,40 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, continue; } - mdbx_debug("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, - dirty_pgno_form); - mdbx_spill_remove(parent, s, 1); + DEBUG("remove dirtied 
parent's spilled %u page %" PRIaPGNO, npages, + dirty_pgno_form); + spill_remove(parent, s, 1); s += step; } /* Squash deleted pagenums if we deleted any */ - mdbx_spill_purge(parent); + spill_purge(parent); } } /* Remove anything in our spill list from parent's dirty list */ if (txn->tw.spill_pages) { - mdbx_tassert(txn, pnl_check_allocated(txn->tw.spill_pages, - (size_t)parent->mt_next_pgno << 1)); - mdbx_dpl_sift(parent, txn->tw.spill_pages, true); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); + tASSERT(txn, pnl_check_allocated(txn->tw.spill_pages, + (size_t)parent->mt_next_pgno << 1)); + dpl_sift(parent, txn->tw.spill_pages, true); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); } /* Find length of merging our dirty list with parent's and release * filter-out pages */ for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { MDBX_page *sp = src->items[s].ptr; - mdbx_tassert(parent, - (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); + tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); const unsigned s_npages = dpl_npages(src, s); const pgno_t s_pgno = src->items[s].pgno; MDBX_page *dp = dst->items[d].ptr; - mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | - P_OVERFLOW | P_SPILLED)) == 0); + tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_SPILLED)) == 0); const unsigned d_npages = dpl_npages(dst, d); const pgno_t d_pgno = dst->items[d].pgno; @@ -10137,18 +10057,17 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } else { dst->items[d--].ptr = nullptr; if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - 
mdbx_dpage_free(txn->mt_env, dp, d_npages); + dpage_free(txn->mt_env, dp, d_npages); } } assert(dst->sorted == dst->length); - mdbx_tassert(parent, dst->detent >= l + d + s); + tASSERT(parent, dst->detent >= l + d + s); dst->sorted = l + d + s; /* the merged length */ while (s > 0) { MDBX_page *sp = src->items[s].ptr; - mdbx_tassert(parent, - (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); + tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); if (sp->mp_flags != P_LOOSE) { sp->mp_txnid = parent->mt_front; sp->mp_flags &= ~P_SPILLED; @@ -10170,7 +10089,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } ++w; } - mdbx_notice("squash to begin for extending-merge %u -> %u", d, w - 1); + NOTICE("squash to begin for extending-merge %u -> %u", d, w - 1); d = w - 1; continue; } @@ -10212,7 +10131,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } --w; } - mdbx_notice("squash to end for shrinking-merge %u -> %u", d, w + 1); + NOTICE("squash to end for shrinking-merge %u -> %u", d, w + 1); d = w + 1; continue; } @@ -10246,19 +10165,19 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); dpl_setlen(dst, dst->sorted); parent->tw.dirtylru = txn->tw.dirtylru; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - mdbx_dpl_free(txn); + tASSERT(parent, dirtylist_check(parent)); + dpl_free(txn); if (txn->tw.spill_pages) { if (parent->tw.spill_pages) { /* Must not fail since space was preserved above. 
*/ pnl_merge(parent->tw.spill_pages, txn->tw.spill_pages); - mdbx_pnl_free(txn->tw.spill_pages); + pnl_free(txn->tw.spill_pages); } else { parent->tw.spill_pages = txn->tw.spill_pages; parent->tw.spill_least_removed = txn->tw.spill_least_removed; } - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); } parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; @@ -10273,7 +10192,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); - const uint64_t ts_0 = latency ? mdbx_osal_monotime() : 0; + const uint64_t ts_0 = latency ? osal_monotime() : 0; uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; uint32_t audit_duration = 0; @@ -10288,14 +10207,14 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { MDBX_env *env = txn->mt_env; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; rc = MDBX_PANIC; goto provide_latency; } #endif /* MDBX_ENV_CHECKPID */ - /* mdbx_txn_end() mode for a commit which writes nothing */ + /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) @@ -10303,43 +10222,43 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { if (txn->mt_child) { rc = mdbx_txn_commit_ex(txn->mt_child, NULL); - mdbx_tassert(txn, txn->mt_child == NULL); + tASSERT(txn, txn->mt_child == NULL); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } if (unlikely(txn != env->me_txn)) { - mdbx_debug("%s", "attempt to commit unknown transaction"); + DEBUG("%s", "attempt to commit unknown transaction"); rc = MDBX_EINVAL; goto fail; } if (txn->mt_parent) { - mdbx_tassert(txn, 
mdbx_audit_ex(txn, 0, false) == 0); - mdbx_assert(env, txn != env->me_txn0); + tASSERT(txn, audit_ex(txn, 0, false) == 0); + eASSERT(env, txn != env->me_txn0); MDBX_txn *const parent = txn->mt_parent; - mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); - mdbx_assert(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert(env, mdbx_dirtylist_check(txn)); + eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, dirtylist_check(txn)); if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && parent->mt_numdbs == txn->mt_numdbs) { for (int i = txn->mt_numdbs; --i >= 0;) { - mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); if ((txn->mt_dbistate[i] & DBI_STALE) && !(parent->mt_dbistate[i] & DBI_STALE)) - mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], - sizeof(MDBX_db)) == 0); + tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], + sizeof(MDBX_db)) == 0); } - mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo, - sizeof(parent->mt_geo)) == 0); - mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, - sizeof(parent->mt_canary)) == 0); - mdbx_tassert(txn, !txn->tw.spill_pages || - MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); - mdbx_tassert(txn, txn->tw.loose_count == 0); + tASSERT(txn, memcmp(&parent->mt_geo, &txn->mt_geo, + sizeof(parent->mt_geo)) == 0); + tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary, + sizeof(parent->mt_canary)) == 0); + tASSERT(txn, + !txn->tw.spill_pages || MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); + tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; @@ -10350,30 +10269,29 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { * if allocation fails. 
*/ const unsigned parent_retired_len = (unsigned)(uintptr_t)parent->tw.retired_pages; - mdbx_tassert(txn, - parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); + tASSERT(txn, parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); const unsigned retired_delta = MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len; if (retired_delta) { - rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, retired_delta); + rc = pnl_need(&txn->tw.reclaimed_pglist, retired_delta); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } if (txn->tw.spill_pages) { if (parent->tw.spill_pages) { - rc = mdbx_pnl_need(&parent->tw.spill_pages, - MDBX_PNL_SIZE(txn->tw.spill_pages)); + rc = pnl_need(&parent->tw.spill_pages, + MDBX_PNL_SIZE(txn->tw.spill_pages)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - mdbx_spill_purge(txn); + spill_purge(txn); } if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > parent->tw.dirtylist->detent && - !mdbx_dpl_reserve(parent, txn->tw.dirtylist->length + - parent->tw.dirtylist->length))) { + !dpl_reserve(parent, txn->tw.dirtylist->length + + parent->tw.dirtylist->length))) { rc = MDBX_ENOMEM; goto fail; } @@ -10386,7 +10304,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->tw.retired_pages = txn->tw.retired_pages; txn->tw.retired_pages = NULL; - mdbx_pnl_free(parent->tw.reclaimed_pglist); + pnl_free(parent->tw.reclaimed_pglist); parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; txn->tw.reclaimed_pglist = NULL; parent->tw.last_reclaimed = txn->tw.last_reclaimed; @@ -10403,7 +10321,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->tw.loose_pages = txn->tw.loose_pages; /* Merge our cursors into parent's and close them */ - mdbx_cursors_eot(txn, true); + cursors_eot(txn, true); end_mode |= MDBX_END_EOTDONE; /* Update parent's DBs array */ @@ -10414,53 +10332,53 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { const uint8_t state = 
txn->mt_dbistate[i] | (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - mdbx_debug("db %u dbi-state %s 0x%02x -> 0x%02x", i, - (parent->mt_dbistate[i] != state) ? "update" : "still", - parent->mt_dbistate[i], state); + DEBUG("db %u dbi-state %s 0x%02x -> 0x%02x", i, + (parent->mt_dbistate[i] != state) ? "update" : "still", + parent->mt_dbistate[i], state); parent->mt_dbistate[i] = state; } - ts_1 = latency ? mdbx_osal_monotime() : 0; - mdbx_txn_merge(parent, txn, parent_retired_len); - ts_2 = latency ? mdbx_osal_monotime() : 0; + ts_1 = latency ? osal_monotime() : 0; + txn_merge(parent, txn, parent_retired_len); + ts_2 = latency ? osal_monotime() : 0; env->me_txn = parent; parent->mt_child = NULL; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); #if MDBX_ENABLE_REFUND - mdbx_refund(parent); - if (mdbx_assert_enabled()) { + txn_refund(parent); + if (ASSERT_ENABLED()) { /* Check parent's loose pages not suitable for refund */ for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) - mdbx_tassert(parent, lp->mp_pgno < parent->tw.loose_refund_wl && - lp->mp_pgno + 1 < parent->mt_next_pgno); + tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && + lp->mp_pgno + 1 < parent->mt_next_pgno); /* Check parent's reclaimed pages not suitable for refund */ if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)) - mdbx_tassert(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < - parent->mt_next_pgno); + tASSERT(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < + parent->mt_next_pgno); } #endif /* MDBX_ENABLE_REFUND */ - ts_4 = ts_3 = latency ? mdbx_osal_monotime() : 0; + ts_4 = ts_3 = latency ? 
osal_monotime() : 0; txn->mt_signature = 0; - mdbx_free(txn); - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + osal_free(txn); + tASSERT(parent, audit_ex(parent, 0, false) == 0); rc = MDBX_SUCCESS; goto provide_latency; } - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - mdbx_cursors_eot(txn, false); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + cursors_eot(txn, false); end_mode |= MDBX_END_EOTDONE; if (txn->tw.dirtylist->length == 0 && (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { for (int i = txn->mt_numdbs; --i >= 0;) - mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); #if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT - rc = mdbx_txn_end(txn, end_mode); + rc = txn_end(txn, end_mode); if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = MDBX_RESULT_TRUE; @@ -10470,10 +10388,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { #endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ } - mdbx_debug("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (void *)txn, (void *)env, - txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + DEBUG("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { @@ -10481,15 +10399,15 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { MDBX_val data; data.iov_len = sizeof(MDBX_db); - rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto 
fail; for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) { if (txn->mt_dbistate[i] & DBI_DIRTY) { MDBX_db *db = &txn->mt_dbs[i]; - mdbx_debug("update main's entry for sub-db %u, mod_txnid %" PRIaTXN - " -> %" PRIaTXN, - i, db->md_mod_txnid, txn->mt_txnid); + DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, db->md_mod_txnid, txn->mt_txnid); db->md_mod_txnid = txn->mt_txnid; data.iov_base = db; WITH_CURSOR_TRACKING(couple.outer, @@ -10502,12 +10420,12 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } } - ts_1 = latency ? mdbx_osal_monotime() : 0; + ts_1 = latency ? osal_monotime() : 0; gcu_context_t gcu_ctx; rc = gcu_context_init(txn, &gcu_ctx); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - rc = mdbx_update_gc(txn, &gcu_ctx); + rc = update_gc(txn, &gcu_ctx); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -10519,23 +10437,23 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; - ts_2 = latency ? mdbx_osal_monotime() : 0; - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); - const uint64_t audit_end = mdbx_osal_monotime(); - audit_duration = mdbx_osal_monotime_to_16dot16(audit_end - ts_2); + ts_2 = latency ? osal_monotime() : 0; + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); + const uint64_t audit_end = osal_monotime(); + audit_duration = osal_monotime_to_16dot16(audit_end - ts_2); ts_2 = audit_end; if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - struct mdbx_iov_ctx write_ctx; - mdbx_iov_init(txn, &write_ctx); - rc = mdbx_txn_write(txn, &write_ctx); + struct iov_ctx write_ctx; + iov_init(txn, &write_ctx); + rc = txn_write(txn, &write_ctx); if (likely(rc == MDBX_SUCCESS)) - mdbx_iov_done(txn, &write_ctx); + iov_done(txn, &write_ctx); /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ - ts_3 = latency ? 
mdbx_osal_monotime() : 0; + ts_3 = latency ? osal_monotime() : 0; if (likely(rc == MDBX_SUCCESS)) { const MDBX_meta *head = constmeta_prefer_last(env); @@ -10556,16 +10474,16 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { #if MDBX_ENABLE_BIGFOOT if (gcu_ctx.bigfoot > txn->mt_txnid) { commit_txnid = gcu_ctx.bigfoot; - mdbx_trace("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, - (unsigned)(commit_txnid - txn->mt_txnid)); + TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, + (unsigned)(commit_txnid - txn->mt_txnid)); } #endif meta_set_txnid(env, &meta, commit_txnid); - rc = mdbx_sync_locked( - env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); + rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, + &meta); } - ts_4 = latency ? mdbx_osal_monotime() : 0; + ts_4 = latency ? osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; goto fail; @@ -10574,22 +10492,18 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: - rc = mdbx_txn_end(txn, end_mode); + rc = txn_end(txn, end_mode); provide_latency: if (latency) { latency->audit = audit_duration; - latency->preparation = - ts_1 ? mdbx_osal_monotime_to_16dot16(ts_1 - ts_0) : 0; - latency->gc = - (ts_1 && ts_2) ? mdbx_osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->write = - (ts_2 && ts_3) ? mdbx_osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->sync = - (ts_3 && ts_4) ? mdbx_osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - const uint64_t ts_5 = mdbx_osal_monotime(); - latency->ending = ts_4 ? mdbx_osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - latency->whole = mdbx_osal_monotime_to_16dot16(ts_5 - ts_0); + latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc = (ts_1 && ts_2) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->write = (ts_2 && ts_3) ? 
osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->sync = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + const uint64_t ts_5 = osal_monotime(); + latency->ending = ts_4 ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + latency->whole = osal_monotime_to_16dot16(ts_5 - ts_0); } return rc; @@ -10598,48 +10512,46 @@ fail: goto provide_latency; } -static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, - const MDBX_page *const page, - const unsigned meta_number, - unsigned *guess_pagesize) { +static int validate_meta(MDBX_env *env, MDBX_meta *const meta, + const MDBX_page *const page, + const unsigned meta_number, unsigned *guess_pagesize) { const uint64_t magic_and_version = unaligned_peek_u64(4, &meta->mm_magic_and_version); if (unlikely(magic_and_version != MDBX_DATA_MAGIC && magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT && magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) { - mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, - magic_and_version); + ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number, + magic_and_version); return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID : MDBX_VERSION_MISMATCH; } if (unlikely(page->mp_pgno != meta_number)) { - mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, - page->mp_pgno); + ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->mp_pgno); return MDBX_INVALID; } if (unlikely(page->mp_flags != P_META)) { - mdbx_error("page #%u not a meta-page", meta_number); + ERROR("page #%u not a meta-page", meta_number); return MDBX_INVALID; } /* LY: check pagesize */ if (unlikely(!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE || meta->mm_psize > MAX_PAGESIZE)) { - mdbx_warning("meta[%u] has invalid pagesize (%u), skip it", meta_number, - meta->mm_psize); + WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number, + meta->mm_psize); return is_powerof2(meta->mm_psize) ? 
MDBX_VERSION_MISMATCH : MDBX_INVALID; } if (guess_pagesize && *guess_pagesize != meta->mm_psize) { *guess_pagesize = meta->mm_psize; - mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize); + VERBOSE("meta[%u] took pagesize %u", meta_number, meta->mm_psize); } const txnid_t txnid = unaligned_peek_u64(4, &meta->mm_txnid_a); if (unlikely(txnid != unaligned_peek_u64(4, &meta->mm_txnid_b))) { - mdbx_warning("meta[%u] not completely updated, skip it", meta_number); + WARNING("meta[%u] not completely updated, skip it", meta_number); return MDBX_RESULT_TRUE; } @@ -10647,33 +10559,32 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, if (META_IS_STEADY(meta) && unlikely(unaligned_peek_u64(4, &meta->mm_datasync_sign) != meta_sign(meta))) { - mdbx_warning("meta[%u] has invalid steady-checksum (0x%" PRIx64 - " != 0x%" PRIx64 "), skip it", - meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign), - meta_sign(meta)); + WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 + "), skip it", + meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign), + meta_sign(meta)); return MDBX_RESULT_TRUE; } - mdbx_debug("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, - meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, - meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper, - pv2pages(meta->mm_geo.grow_pv), pv2pages(meta->mm_geo.shrink_pv), - txnid, mdbx_durable_str(meta)); + DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, + meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, meta->mm_geo.next, + meta->mm_geo.now, meta->mm_geo.upper, pv2pages(meta->mm_geo.grow_pv), + 
pv2pages(meta->mm_geo.shrink_pv), txnid, durable_caption(meta)); if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) { - mdbx_warning("meta[%u] has invalid txnid %" PRIaTXN ", skip it", - meta_number, txnid); + WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number, + txnid); return MDBX_RESULT_TRUE; } /* LY: check min-pages value */ if (unlikely(meta->mm_geo.lower < MIN_PAGENO || meta->mm_geo.lower > MAX_PAGENO + 1)) { - mdbx_warning("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.lower); + WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.lower); return MDBX_INVALID; } @@ -10681,16 +10592,16 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, if (unlikely(meta->mm_geo.upper < MIN_PAGENO || meta->mm_geo.upper > MAX_PAGENO + 1 || meta->mm_geo.upper < meta->mm_geo.lower)) { - mdbx_warning("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.upper); + WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.upper); return MDBX_INVALID; } /* LY: check last_pgno */ if (unlikely(meta->mm_geo.next < MIN_PAGENO || meta->mm_geo.next - 1 > MAX_PAGENO)) { - mdbx_warning("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.next); + WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next); return MDBX_CORRUPTED; } @@ -10698,20 +10609,20 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { /* Here could be a race with DB-shrinking performed by other process */ - int err = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); + int err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(err != MDBX_SUCCESS)) return err; if 
(unlikely(used_bytes > env->me_dxb_mmap.filesize)) { - mdbx_warning("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 - "), skip it", - meta_number, used_bytes, env->me_dxb_mmap.filesize); + WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 + "), skip it", + meta_number, used_bytes, env->me_dxb_mmap.filesize); return MDBX_CORRUPTED; } } if (unlikely(meta->mm_geo.next - 1 > MAX_PAGENO || used_bytes > MAX_MAPSIZE)) { - mdbx_warning("meta[%u] has too large used-space (%" PRIu64 "), skip it", - meta_number, used_bytes); + WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it", + meta_number, used_bytes); return MDBX_TOO_LARGE; } @@ -10724,24 +10635,24 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) { if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE && mapsize_min <= MAX_MAPSIZE64) { - mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && - used_bytes <= MAX_MAPSIZE); - mdbx_warning("meta[%u] has too large min-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_min, used_bytes); + eASSERT(env, + meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_min, used_bytes); geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->mm_psize); if (geo_lower > MAX_PAGENO + 1) { geo_lower = MAX_PAGENO + 1; mapsize_min = geo_lower * (uint64_t)meta->mm_psize; } - mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "lower", geo_lower, meta->mm_geo.lower); + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "lower", geo_lower, 
meta->mm_geo.lower); meta->mm_geo.lower = geo_lower; } else { - mdbx_warning("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_min); + WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_min); return MDBX_VERSION_MISMATCH; } } @@ -10754,25 +10665,25 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, ceil_powerof2((size_t)mapsize_max, env->me_os_psize) / (size_t)meta->mm_psize)) { if (mapsize_max > MAX_MAPSIZE64) { - mdbx_warning("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_max); + WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_max); return MDBX_VERSION_MISMATCH; } /* allow to open large DB from a 32-bit environment */ - mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && - used_bytes <= MAX_MAPSIZE); - mdbx_warning("meta[%u] has too large max-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_max, used_bytes); + eASSERT(env, + meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_max, used_bytes); geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->mm_psize); if (geo_upper > MAX_PAGENO + 1) { geo_upper = MAX_PAGENO + 1; mapsize_max = geo_upper * (uint64_t)meta->mm_psize; } - mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "upper", geo_upper, meta->mm_geo.upper); + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "upper", geo_upper, meta->mm_geo.upper); meta->mm_geo.upper = geo_upper; } @@ -10790,16 +10701,16 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, 
geo_now = geo_upper; if (unlikely(meta->mm_geo.next > geo_now)) { - mdbx_warning("meta[%u] next-pageno (%" PRIaPGNO - ") is beyond end-pgno (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.next, geo_now); + WARNING("meta[%u] next-pageno (%" PRIaPGNO + ") is beyond end-pgno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next, geo_now); return MDBX_CORRUPTED; } if (meta->mm_geo.now != geo_now) { - mdbx_warning("meta[%u] consider geo-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "now", geo_now, meta->mm_geo.now); + WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "now", geo_now, meta->mm_geo.now); meta->mm_geo.now = geo_now; } @@ -10810,12 +10721,12 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, meta->mm_dbs[FREE_DBI].md_entries || meta->mm_dbs[FREE_DBI].md_leaf_pages || meta->mm_dbs[FREE_DBI].md_overflow_pages)) { - mdbx_warning("meta[%u] has false-empty %s, skip it", meta_number, "GC"); + WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC"); return MDBX_CORRUPTED; } } else if (unlikely(meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next)) { - mdbx_warning("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", - meta_number, "GC", meta->mm_dbs[FREE_DBI].md_root); + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "GC", meta->mm_dbs[FREE_DBI].md_root); return MDBX_CORRUPTED; } @@ -10826,44 +10737,43 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, meta->mm_dbs[MAIN_DBI].md_entries || meta->mm_dbs[MAIN_DBI].md_leaf_pages || meta->mm_dbs[MAIN_DBI].md_overflow_pages)) { - mdbx_warning("meta[%u] has false-empty %s", meta_number, "MainDB"); + WARNING("meta[%u] has false-empty %s", meta_number, "MainDB"); return MDBX_CORRUPTED; } } else if (unlikely(meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next)) { - 
mdbx_warning("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", - meta_number, "MainDB", meta->mm_dbs[MAIN_DBI].md_root); + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "MainDB", meta->mm_dbs[MAIN_DBI].md_root); return MDBX_CORRUPTED; } if (unlikely(meta->mm_dbs[FREE_DBI].md_mod_txnid > txnid)) { - mdbx_warning("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); + WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); return MDBX_CORRUPTED; } if (unlikely(meta->mm_dbs[MAIN_DBI].md_mod_txnid > txnid)) { - mdbx_warning("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); + WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); return MDBX_CORRUPTED; } return MDBX_SUCCESS; } -static int mdbx_validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, - MDBX_meta *dest) { +static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, + MDBX_meta *dest) { *dest = *meta; - return mdbx_validate_meta(env, dest, data_page(meta), - bytes2pgno(env, (uint8_t *)meta - env->me_map), - nullptr); + return validate_meta(env, dest, data_page(meta), + bytes2pgno(env, (uint8_t *)meta - env->me_map), nullptr); } /* Read the environment parameters of a DB environment * before mapping it into memory. 
*/ -__cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, - const int lck_exclusive, - const mdbx_mode_t mode_bits) { - int rc = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); +__cold static int read_header(MDBX_env *env, MDBX_meta *dest, + const int lck_exclusive, + const mdbx_mode_t mode_bits) { + int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -10885,43 +10795,42 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, char buffer[MIN_PAGESIZE]; unsigned retryleft = 42; while (1) { - mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u", - meta_number, offset, MIN_PAGESIZE, retryleft); - int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, + offset, MIN_PAGESIZE, retryleft); + int err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && env->me_dxb_mmap.filesize == 0 && mode_bits /* non-zero for DB creation */ != 0) - mdbx_notice("read meta: empty file (%d, %s)", err, - mdbx_strerror(err)); + NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err)); else - mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); + ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } char again[MIN_PAGESIZE]; - err = mdbx_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); + err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { - mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); + ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0) break; - mdbx_verbose("meta[%u] was updated, re-read it", meta_number); + 
VERBOSE("meta[%u] was updated, re-read it", meta_number); } if (!retryleft) { - mdbx_error("meta[%u] is too volatile, skip it", meta_number); + ERROR("meta[%u] is too volatile, skip it", meta_number); continue; } MDBX_page *const page = (MDBX_page *)buffer; MDBX_meta *const meta = page_meta(page); - rc = mdbx_validate_meta(env, meta, page, meta_number, &guess_pagesize); + rc = validate_meta(env, meta, page, meta_number, &guess_pagesize); if (rc != MDBX_SUCCESS) continue; @@ -10932,7 +10841,7 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, *dest = *meta; if (!lck_exclusive && !META_IS_STEADY(dest)) loop_limit += 1; /* LY: should re-read to hush race with update */ - mdbx_verbose("latch meta[%u]", meta_number); + VERBOSE("latch meta[%u]", meta_number); } } @@ -10940,7 +10849,7 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, (env->me_stuck_meta < 0 && !(META_IS_STEADY(dest) || meta_weak_acceptable(env, dest, lck_exclusive)))) { - mdbx_error("%s", "no usable meta-pages, database is corrupted"); + ERROR("%s", "no usable meta-pages, database is corrupted"); if (rc == MDBX_SUCCESS) { /* TODO: try to restore the database by fully checking b-tree structure * for the each meta page, if the corresponding option was given */ @@ -10952,15 +10861,15 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, return MDBX_SUCCESS; } -__cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model, - unsigned num) { - mdbx_ensure(env, is_powerof2(env->me_psize)); - mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); - mdbx_ensure(env, env->me_psize <= MAX_PAGESIZE); - mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); +__cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, + unsigned num) { + ENSURE(env, 
is_powerof2(env->me_psize)); + ENSURE(env, env->me_psize >= MIN_PAGESIZE); + ENSURE(env, env->me_psize <= MAX_PAGESIZE); + ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); memset(model, 0, env->me_psize); model->mp_pgno = num; @@ -10976,16 +10885,16 @@ __cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model, model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); model_meta->mm_geo.next = NUM_METAS; - mdbx_ensure(env, model_meta->mm_geo.lower >= MIN_PAGENO); - mdbx_ensure(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); - mdbx_ensure(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); - mdbx_ensure(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); - mdbx_ensure(env, model_meta->mm_geo.next >= MIN_PAGENO); - mdbx_ensure(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); - mdbx_ensure(env, model_meta->mm_geo.grow_pv == - pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); - mdbx_ensure(env, model_meta->mm_geo.shrink_pv == - pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); + ENSURE(env, model_meta->mm_geo.lower >= MIN_PAGENO); + ENSURE(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); + ENSURE(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); + ENSURE(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); + ENSURE(env, model_meta->mm_geo.next >= MIN_PAGENO); + ENSURE(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); + ENSURE(env, model_meta->mm_geo.grow_pv == + pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); + ENSURE(env, model_meta->mm_geo.shrink_pv == + pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); model_meta->mm_psize = env->me_psize; model_meta->mm_dbs[FREE_DBI].md_flags = MDBX_INTEGERKEY; @@ -10993,26 +10902,26 @@ __cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model, model_meta->mm_dbs[MAIN_DBI].md_root = 
P_INVALID; meta_set_txnid(env, model_meta, MIN_TXNID + num); unaligned_poke_u64(4, model_meta->mm_datasync_sign, meta_sign(model_meta)); - mdbx_assert(env, meta_checktxnid(env, model_meta, true)); + eASSERT(env, meta_checktxnid(env, model_meta, true)); return (MDBX_page *)((uint8_t *)model + env->me_psize); } /* Fill in most of the zeroed meta-pages for an empty database environment. * Return pointer to recently (head) meta-page. */ -__cold static MDBX_meta *mdbx_init_metas(const MDBX_env *env, void *buffer) { +__cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { MDBX_page *page0 = (MDBX_page *)buffer; - MDBX_page *page1 = mdbx_meta_model(env, page0, 0); - MDBX_page *page2 = mdbx_meta_model(env, page1, 1); - mdbx_meta_model(env, page2, 2); - mdbx_assert(env, !meta_eq(env, page_meta(page0), page_meta(page1))); - mdbx_assert(env, !meta_eq(env, page_meta(page1), page_meta(page2))); - mdbx_assert(env, !meta_eq(env, page_meta(page2), page_meta(page0))); + MDBX_page *page1 = meta_model(env, page0, 0); + MDBX_page *page2 = meta_model(env, page1, 1); + meta_model(env, page2, 2); + eASSERT(env, !meta_eq(env, page_meta(page0), page_meta(page1))); + eASSERT(env, !meta_eq(env, page_meta(page1), page_meta(page2))); + eASSERT(env, !meta_eq(env, page_meta(page2), page_meta(page0))); return page_meta(page2); } #if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64)) -static size_t mdbx_madvise_threshold(const MDBX_env *env, - const size_t largest_bytes) { +static size_t madvise_threshold(const MDBX_env *env, + const size_t largest_bytes) { /* TODO: use options */ const unsigned factor = 9; const size_t threshold = (largest_bytes < (65536ul << factor)) @@ -11024,20 +10933,20 @@ static size_t mdbx_madvise_threshold(const MDBX_env *env, } #endif /* MDBX_ENABLE_MADVISE */ -static int mdbx_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending) { - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); +static int 
sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending) { + eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); const MDBX_meta *const head = constmeta_prefer_last(env); int rc; - mdbx_assert(env, meta_eq_mask(env) == 0); - mdbx_assert(env, - pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); - mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); + eASSERT(env, meta_eq_mask(env) == 0); + eASSERT(env, + pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); + eASSERT(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); + eASSERT(env, pending->mm_geo.next <= pending->mm_geo.now); if (flags & MDBX_SAFE_NOSYNC) { /* Check auto-sync conditions */ @@ -11049,7 +10958,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - + osal_monotime() - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= autosync_period)) flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ @@ -11061,7 +10970,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, const pgno_t largest_pgno = find_largest_snapshot( env, (head->mm_geo.next > pending->mm_geo.next) ? 
head->mm_geo.next : pending->mm_geo.next); - mdbx_assert(env, largest_pgno >= NUM_METAS); + eASSERT(env, largest_pgno >= NUM_METAS); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) const pgno_t edge = env->me_poison_edge; if (edge > largest_pgno) { @@ -11077,31 +10986,29 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) const size_t largest_bytes = pgno2bytes(env, largest_pgno); /* threshold to avoid unreasonable frequent madvise() calls */ - const size_t madvise_threshold = mdbx_madvise_threshold(env, largest_bytes); + const size_t threshold = madvise_threshold(env, largest_bytes); const size_t discard_edge_bytes = bytes_align2os_bytes( env, ((MDBX_RDONLY & (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak : env->me_flags)) ? largest_bytes - : largest_bytes + madvise_threshold)); + : largest_bytes + threshold)); const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); const pgno_t prev_discarded_pgno = atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); - if (prev_discarded_pgno >= - discard_edge_pgno + bytes2pgno(env, madvise_threshold)) { - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, - prev_discarded_pgno); + if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) { + NOTICE("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, + prev_discarded_pgno); atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, mo_Relaxed); const size_t prev_discarded_bytes = ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); - mdbx_ensure(env, prev_discarded_bytes > discard_edge_bytes); + ENSURE(env, prev_discarded_bytes > discard_edge_bytes); #if defined(MADV_DONTNEED) int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ 0 /* MADV_FREE works for only anonymous vma at the moment */ - if ((env->me_flags & MDBX_WRITEMAP) && - mdbx_linux_kernel_version > 0x04050000) + if ((env->me_flags & MDBX_WRITEMAP) 
&& linux_kernel_version > 0x04050000) advise = MADV_FREE; #endif /* MADV_FREE */ int err = madvise(env->me_map + discard_edge_bytes, @@ -11146,17 +11053,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, unaligned_peek_u64(4, pending->mm_txnid_a))) { const txnid_t txnid = safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a)); - mdbx_notice("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, - unaligned_peek_u64(4, pending->mm_txnid_a), txnid); - mdbx_ensure(env, env->me_txn0->mt_owner != mdbx_thread_self() && - !env->me_txn); + NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, + unaligned_peek_u64(4, pending->mm_txnid_a), txnid); + ENSURE(env, env->me_txn0->mt_owner != osal_thread_self() && + !env->me_txn); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto fail; } meta_set_txnid(env, pending, txnid); - mdbx_assert(env, meta_checktxnid(env, pending, true)); + eASSERT(env, meta_checktxnid(env, pending, true)); } } } @@ -11166,8 +11073,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* LY: step#1 - sync previously written/updated data-pages */ rc = MDBX_RESULT_FALSE /* carry steady */; if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE; + eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; if ((flags & MDBX_SAFE_NOSYNC) == 0) { mode_bits = MDBX_SYNC_DATA; if (pending->mm_geo.next > meta_prefer_steady(env)->mm_geo.now) @@ -11180,20 +11087,20 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) rc = - mdbx_msync(&env->me_dxb_mmap, 0, + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); else - rc = 
mdbx_fsync(env->me_lazy_fd, mode_bits); + rc = osal_fsync(env->me_lazy_fd, mode_bits); if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */ : MDBX_RESULT_FALSE /* carry steady */; } - mdbx_assert(env, meta_checktxnid(env, pending, true)); + eASSERT(env, meta_checktxnid(env, pending, true)); /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { - atomic_store64(&env->me_lck->mti_sync_timestamp, mdbx_osal_monotime(), + atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(), mo_Relaxed); unaligned_poke_u64(4, pending->mm_datasync_sign, meta_sign(pending)); atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); @@ -11205,17 +11112,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *target = nullptr; if (constmeta_txnid(env, head) == unaligned_peek_u64(4, pending->mm_txnid_a)) { - mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, - sizeof(head->mm_dbs)) == 0); - mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, - sizeof(head->mm_canary)) == 0); - mdbx_assert(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(pending->mm_geo)) == 0); + eASSERT(env, + memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); + eASSERT(env, memcmp(&head->mm_canary, &pending->mm_canary, + sizeof(head->mm_canary)) == 0); + eASSERT(env, memcmp(&head->mm_geo, &pending->mm_geo, + sizeof(pending->mm_geo)) == 0); if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) target = (MDBX_meta *)head; else { - mdbx_ensure(env, meta_eq(env, head, pending)); - mdbx_debug("%s", "skip update meta"); + ENSURE(env, meta_eq(env, head, pending)); + DEBUG("%s", "skip update meta"); return MDBX_SUCCESS; } } else if (head == meta0) @@ -11223,56 +11130,51 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, else if (head == meta1) target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta0, meta2); else { - mdbx_assert(env, head == meta2); + 
eASSERT(env, head == meta2); target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta0, meta1); } /* LY: step#2 - update meta-page. */ - mdbx_debug( - "writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, - pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, - pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, - pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), - unaligned_peek_u64(4, pending->mm_txnid_a), mdbx_durable_str(pending)); + DEBUG("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, + pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, + pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, + pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), + unaligned_peek_u64(4, pending->mm_txnid_a), durable_caption(pending)); - mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta0 == head) ? "head" - : (meta0 == target) ? "tail" - : "stay", - mdbx_durable_str(meta0), meta_txnid(env, meta0), - meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); - mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta1 == head) ? "head" - : (meta1 == target) ? "tail" - : "stay", - mdbx_durable_str(meta1), meta_txnid(env, meta1), - meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); - mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta2 == head) ? "head" - : (meta2 == target) ? 
"tail" - : "stay", - mdbx_durable_str(meta2), meta_txnid(env, meta2), - meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); + DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta0 == head) ? "head" + : (meta0 == target) ? "tail" + : "stay", + durable_caption(meta0), meta_txnid(env, meta0), + meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); + DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta1 == head) ? "head" + : (meta1 == target) ? "tail" + : "stay", + durable_caption(meta1), meta_txnid(env, meta1), + meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); + DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta2 == head) ? "head" + : (meta2 == target) ? "tail" + : "stay", + durable_caption(meta2), meta_txnid(env, meta2), + meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); - mdbx_assert(env, !meta_eq(env, pending, meta0)); - mdbx_assert(env, !meta_eq(env, pending, meta1)); - mdbx_assert(env, !meta_eq(env, pending, meta2)); + eASSERT(env, !meta_eq(env, pending, meta0)); + eASSERT(env, !meta_eq(env, pending, meta1)); + eASSERT(env, !meta_eq(env, pending, meta2)); - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - mdbx_ensure(env, - target == head || constmeta_txnid(env, target) < + eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + ENSURE(env, target == head || constmeta_txnid(env, target) < unaligned_peek_u64(4, pending->mm_txnid_a)); #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) { - mdbx_jitter4testing(true); + jitter4testing(true); if (likely(target != head)) { /* LY: 'invalidate' the meta. 
*/ meta_update_begin(env, target, @@ -11284,7 +11186,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; uint8_t *provoke_end = (uint8_t *)&target->mm_datasync_sign; memset(provoke_begin, 0xCC, provoke_end - provoke_begin); - mdbx_jitter4testing(false); + jitter4testing(false); #endif /* LY: update info */ @@ -11293,31 +11195,31 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_canary = pending->mm_canary; memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); - mdbx_jitter4testing(true); + jitter4testing(true); /* LY: 'commit' the meta */ meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b)); - mdbx_jitter4testing(true); - mdbx_assert(env, meta_checktxnid(env, target, true)); + jitter4testing(true); + eASSERT(env, meta_checktxnid(env, target, true)); } else { /* dangerous case (target == head), only mm_datasync_sign could * me updated, check assertions once again */ - mdbx_ensure(env, constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a) && - !META_IS_STEADY(head) && META_IS_STEADY(pending)); - mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(head->mm_geo)) == 0); - mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs, - sizeof(head->mm_dbs)) == 0); - mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary, - sizeof(head->mm_canary)) == 0); + ENSURE(env, constmeta_txnid(env, head) == + unaligned_peek_u64(4, pending->mm_txnid_a) && + !META_IS_STEADY(head) && META_IS_STEADY(pending)); + ENSURE(env, memcmp(&head->mm_geo, &pending->mm_geo, + sizeof(head->mm_geo)) == 0); + ENSURE(env, memcmp(&head->mm_dbs, &pending->mm_dbs, + sizeof(head->mm_dbs)) == 0); + ENSURE(env, memcmp(&head->mm_canary, &pending->mm_canary, + sizeof(head->mm_canary)) == 0); } memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8); - 
mdbx_flush_incoherent_cpu_writeback(); - mdbx_jitter4testing(true); + osal_flush_incoherent_cpu_writeback(); + jitter4testing(true); /* sync meta-pages */ rc = - mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (unlikely(rc != MDBX_SUCCESS)) @@ -11330,21 +11232,21 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_pwrite(fd, pending, sizeof(MDBX_meta), + rc = osal_pwrite(fd, pending, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: - mdbx_debug("%s", "write failed, disk error?"); + DEBUG("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. */ - mdbx_pwrite(fd, &undo_meta, sizeof(MDBX_meta), + osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); goto fail; } - mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); /* sync meta-pages */ if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { - rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (rc != MDBX_SUCCESS) goto undo; } @@ -11365,13 +11267,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* LY: shrink datafile if needed */ if (unlikely(shrink)) { - mdbx_verbose("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", - pending->mm_geo.now, shrink); - rc = mdbx_mapresize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, - pending->mm_geo.upper); + VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", + pending->mm_geo.now, shrink); + 
rc = map_resize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, + pending->mm_geo.upper); if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) goto fail; - mdbx_assert(env, meta_checktxnid(env, target, true)); + eASSERT(env, meta_checktxnid(env, target, true)); } MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -11399,23 +11301,23 @@ static void recalculate_merge_threshold(MDBX_env *env) { : bytes / 4 /* 25 % */)); } -__cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { +__cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta)); - mdbx_ensure(env, is_powerof2(pagesize)); - mdbx_ensure(env, pagesize >= MIN_PAGESIZE); - mdbx_ensure(env, pagesize <= MAX_PAGESIZE); + ENSURE(env, is_powerof2(pagesize)); + ENSURE(env, pagesize >= MIN_PAGESIZE); + ENSURE(env, pagesize <= MAX_PAGESIZE); env->me_psize = (unsigned)pagesize; if (env->me_pbuf) { - mdbx_memalign_free(env->me_pbuf); + osal_memalign_free(env->me_pbuf); env->me_pbuf = nullptr; } STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT); const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - mdbx_ensure(env, maxgc_ov1page > 42 && - maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); + ENSURE(env, + maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); @@ -11425,16 +11327,15 @@ __cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize); const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize); - mdbx_ensure(env, - branch_nodemax > (intptr_t)(NODESIZE + 42) && + ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) && 
branch_nodemax % 2 == 0 && leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && leaf_nodemax >= branch_nodemax && leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); env->me_leaf_nodemax = (unsigned)leaf_nodemax; env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); - mdbx_assert(env, pgno2bytes(env, 1) == pagesize); - mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); + eASSERT(env, pgno2bytes(env, 1) == pagesize); + eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); recalculate_merge_threshold(env); const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); @@ -11443,7 +11344,7 @@ __cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { intptr_t total_ram_pages, avail_ram_pages; int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages); if (unlikely(err != MDBX_SUCCESS)) - mdbx_error("mdbx_get_sysraminfo(), rc %d", err); + ERROR("mdbx_get_sysraminfo(), rc %d", err); else { size_t reasonable_dpl_limit = (size_t)(total_ram_pages + avail_ram_pages) / 42; @@ -11475,7 +11376,7 @@ lckless_stub(const MDBX_env *env) { } __cold int mdbx_env_create(MDBX_env **penv) { - MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env)); + MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); if (unlikely(!env)) return MDBX_ENOMEM; @@ -11484,7 +11385,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_lazy_fd = INVALID_HANDLE_VALUE; env->me_dsync_fd = INVALID_HANDLE_VALUE; env->me_lfd = INVALID_HANDLE_VALUE; - env->me_pid = mdbx_getpid(); + env->me_pid = osal_getpid(); env->me_stuck_meta = -1; env->me_options.dp_reserve_limit = 1024; @@ -11502,37 +11403,37 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; int rc; - const size_t os_psize = mdbx_syspagesize(); + const size_t os_psize = osal_syspagesize(); if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { - mdbx_error("unsuitable system pagesize %" PRIuPTR, os_psize); + ERROR("unsuitable 
system pagesize %" PRIuPTR, os_psize); rc = MDBX_INCOMPATIBLE; goto bailout; } env->me_os_psize = (unsigned)os_psize; - mdbx_setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize - : MAX_PAGESIZE); + setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize + : MAX_PAGESIZE); - rc = mdbx_fastmutex_init(&env->me_dbi_lock); + rc = osal_fastmutex_init(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_Init(&env->me_remap_guard); + osal_srwlock_Init(&env->me_remap_guard); InitializeCriticalSection(&env->me_windowsbug_lock); #else - rc = mdbx_fastmutex_init(&env->me_remap_guard); + rc = osal_fastmutex_init(&env->me_remap_guard); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_fastmutex_destroy(&env->me_dbi_lock); + osal_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - rc = mdbx_ipclock_stub(&stub->mti_wlock); + rc = osal_ipclock_stub(&stub->mti_wlock); #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_fastmutex_destroy(&env->me_remap_guard); - mdbx_fastmutex_destroy(&env->me_dbi_lock); + osal_fastmutex_destroy(&env->me_remap_guard); + osal_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } #endif /* Windows */ @@ -11543,7 +11444,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { return MDBX_SUCCESS; bailout: - mdbx_free(env); + osal_free(env); *penv = nullptr; return rc; } @@ -11588,7 +11489,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, return rc; const bool inside_txn = - (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()); + (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()); #if MDBX_DEBUG if (growth_step < 0) { @@ -11657,7 +11558,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pagesize = env->me_os_psize; if ((uintptr_t)pagesize > MAX_PAGESIZE) pagesize = 
MAX_PAGESIZE; - mdbx_assert(env, (uintptr_t)pagesize >= MIN_PAGESIZE); + eASSERT(env, (uintptr_t)pagesize >= MIN_PAGESIZE); } else if (pagesize == 0 /* minimal */) pagesize = MIN_PAGESIZE; @@ -11767,7 +11668,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if ((size_t)size_upper < (size_t)size_lower) size_lower = size_upper; } - mdbx_assert(env, (size_upper - size_lower) % env->me_os_psize == 0); + eASSERT(env, (size_upper - size_lower) % env->me_os_psize == 0); if (size_now < size_lower) size_now = size_lower; @@ -11796,7 +11697,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (!env->me_map) { /* save user's geo-params for future open/create */ if (pagesize != (intptr_t)env->me_psize) - mdbx_setup_pagesize(env, pagesize); + setup_pagesize(env, pagesize); env->me_dbgeo.lower = size_lower; env->me_dbgeo.now = size_now; env->me_dbgeo.upper = size_upper; @@ -11805,36 +11706,35 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); - mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); - mdbx_ensure(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.lower % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); + ENSURE(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.lower % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - mdbx_ensure(env, - env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1); - mdbx_ensure(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.upper % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + ENSURE(env, env->me_dbgeo.upper / 
(unsigned)pagesize <= MAX_PAGENO + 1); + ENSURE(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.upper % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); - mdbx_ensure(env, env->me_dbgeo.now % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.now % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); + ENSURE(env, env->me_dbgeo.now % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.now % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.grow % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.grow % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.shrink % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.grow % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.grow % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.shrink % env->me_os_psize == 0); rc = MDBX_SUCCESS; } else { /* apply new params to opened environment */ - mdbx_ensure(env, pagesize == (intptr_t)env->me_psize); + ENSURE(env, pagesize == (intptr_t)env->me_psize); MDBX_meta meta; memset(&meta, 0, sizeof(meta)); const MDBX_geo *current_geo; if (!inside_txn) { - mdbx_assert(env, need_unlock); + eASSERT(env, need_unlock); const MDBX_meta *head = constmeta_prefer_last(env); uint64_t timestamp = 0; @@ -11850,7 +11750,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, const txnid_t txnid = safe64_txnid_next(constmeta_txnid(env, &meta)); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto bailout; } meta_set_txnid(env, &meta, txnid); @@ -11867,22 +11767,19 @@ mdbx_env_set_geometry(MDBX_env *env, 
intptr_t size_lower, intptr_t size_now, new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold)); new_geo.next = current_geo->next; - mdbx_ensure(env, - pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); - mdbx_ensure(env, - pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); - mdbx_ensure(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); - mdbx_ensure(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); - mdbx_ensure(env, - new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); + ENSURE(env, pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); + ENSURE(env, pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); + ENSURE(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); + ENSURE(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); + ENSURE(env, new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); - mdbx_ensure(env, (size_t)size_lower >= MIN_MAPSIZE); - mdbx_ensure(env, new_geo.lower >= MIN_PAGENO); - mdbx_ensure(env, (size_t)size_upper <= MAX_MAPSIZE); - mdbx_ensure(env, new_geo.upper <= MAX_PAGENO + 1); - mdbx_ensure(env, new_geo.now >= new_geo.next); - mdbx_ensure(env, new_geo.upper >= new_geo.now); - mdbx_ensure(env, new_geo.now >= new_geo.lower); + ENSURE(env, (size_t)size_lower >= MIN_MAPSIZE); + ENSURE(env, new_geo.lower >= MIN_PAGENO); + ENSURE(env, (size_t)size_upper <= MAX_MAPSIZE); + ENSURE(env, new_geo.upper <= MAX_PAGENO + 1); + ENSURE(env, new_geo.now >= new_geo.next); + ENSURE(env, new_geo.upper >= new_geo.now); + ENSURE(env, new_geo.now >= new_geo.lower); if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) { #if defined(_WIN32) || defined(_WIN64) @@ -11894,7 +11791,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, rc = MDBX_EPERM; goto bailout; } - int err = mdbx_rdt_lock(env); + int err = osal_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(err))) { rc = err; goto bailout; @@ -11915,7 +11812,7 @@ 
mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } } - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -11923,8 +11820,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { - rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper, - false); + rc = map_resize(env, current_geo->next, new_geo.now, new_geo.upper, + false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -11933,7 +11830,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_txn->mt_flags |= MDBX_TXN_DIRTY; } else { meta.mm_geo = new_geo; - rc = mdbx_sync_locked(env, env->me_flags, &meta); + rc = sync_locked(env, env->me_flags, &meta); } if (likely(rc == MDBX_SUCCESS)) { @@ -11978,23 +11875,23 @@ __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { __cold static int alloc_page_buf(MDBX_env *env) { return env->me_pbuf ? 
MDBX_SUCCESS - : mdbx_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS, + : osal_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS, &env->me_pbuf); } /* Further setup required for opening an MDBX environment */ -__cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, - const mdbx_mode_t mode_bits) { +__cold static int setup_dxb(MDBX_env *env, const int lck_rc, + const mdbx_mode_t mode_bits) { MDBX_meta meta; int rc = MDBX_RESULT_FALSE; - int err = mdbx_read_header(env, &meta, lck_rc, mode_bits); + int err = read_header(env, &meta, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) return err; - mdbx_debug("%s", "create new database"); + DEBUG("%s", "create new database"); rc = /* new database */ MDBX_RESULT_TRUE; if (!env->me_dbgeo.now) { @@ -12008,35 +11905,35 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, if (unlikely(err != MDBX_SUCCESS)) return err; - meta = *mdbx_init_metas(env, env->me_pbuf); - err = mdbx_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, + meta = *init_metas(env, env->me_pbuf); + err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, 0); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize = + err = osal_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize = env->me_dxb_mmap.current = env->me_dbgeo.now); if (unlikely(err != MDBX_SUCCESS)) return err; #ifndef NDEBUG /* just for checking */ - err = mdbx_read_header(env, &meta, lck_rc, mode_bits); + err = read_header(env, &meta, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) return err; #endif } - mdbx_verbose( + VERBOSE( "header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", 
meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv), pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), mdbx_durable_str(&meta)); + unaligned_peek_u64(4, meta.mm_txnid_a), durable_caption(&meta)); if (env->me_psize != meta.mm_psize) - mdbx_setup_pagesize(env, meta.mm_psize); + setup_pagesize(env, meta.mm_psize); const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); const size_t used_aligned2os_bytes = ceil_powerof2(used_bytes, env->me_os_psize); @@ -12050,8 +11947,8 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, meta.mm_geo.upper * pagesize, pv2pages(meta.mm_geo.grow_pv) * pagesize, pv2pages(meta.mm_geo.shrink_pv) * pagesize, meta.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("%s: err %d", "could not apply preconfigured geometry from db", - err); + ERROR("%s: err %d", "could not apply preconfigured geometry from db", + err); return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; } } else if (env->me_dbgeo.now) { @@ -12083,8 +11980,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, env->me_dbgeo.upper, env->me_dbgeo.grow, env->me_dbgeo.shrink, meta.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("%s: err %d", "could not apply preconfigured db-geometry", - err); + ERROR("%s: err %d", "could not apply preconfigured db-geometry", err); return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; } @@ -12095,21 +11991,20 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, meta.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); meta.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); - mdbx_verbose("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO - "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, - meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv), - pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), - mdbx_durable_str(&meta)); + VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, + meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, + meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv), + pv2pages(meta.mm_geo.shrink_pv), + unaligned_peek_u64(4, meta.mm_txnid_a), durable_caption(&meta)); } else { /* fetch back 'now/current' size, since it was ignored during comparison * and may differ. */ env->me_dbgeo.now = pgno_align2os_bytes(env, meta.mm_geo.now); } - mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next); + ENSURE(env, meta.mm_geo.now >= meta.mm_geo.next); } else { /* geo-params are not pre-configured by user, * get current values from the meta. 
*/ @@ -12120,45 +12015,44 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)); } - mdbx_ensure(env, - pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now); - mdbx_ensure(env, env->me_dbgeo.now >= used_bytes); + ENSURE(env, pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now); + ENSURE(env, env->me_dbgeo.now >= used_bytes); const uint64_t filesize_before = env->me_dxb_mmap.filesize; if (unlikely(filesize_before != env->me_dbgeo.now)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { - mdbx_verbose("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p), " - "assume other process working", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); + VERBOSE("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p), " + "assume other process working", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); } else { - mdbx_warning("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p)", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); + WARNING("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p)", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); if (filesize_before < used_bytes) { - mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO - ", have %" PRIaPGNO ")", - meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); + ERROR("last-page beyond end-of-file (last %" PRIaPGNO + ", have %" PRIaPGNO ")", + meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { if 
(filesize_before & (env->me_os_psize - 1)) { - mdbx_error("%s", "filesize should be rounded-up to system page"); + ERROR("%s", "filesize should be rounded-up to system page"); return MDBX_WANNA_RECOVERY; } - mdbx_warning("%s", "ignore filesize mismatch in readonly-mode"); + WARNING("%s", "ignore filesize mismatch in readonly-mode"); } else { - mdbx_verbose("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO - " pages", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); + VERBOSE("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO + " pages", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); } } } - mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", - bootid.x, bootid.y, (bootid.x | bootid.y) ? "" : "not-"); + VERBOSE("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x, + bootid.y, (bootid.x | bootid.y) ? "" : "not-"); #if MDBX_ENABLE_MADVISE /* calculate readahead hint before mmap with zero redundant pages */ @@ -12167,7 +12061,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; #endif /* MDBX_ENABLE_MADVISE */ - err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + err = osal_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -12181,7 +12075,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, return err; #endif /* MADV_DONTDUMP */ #if defined(MADV_DODUMP) - if (mdbx_runtime_flags & MDBX_DBG_DUMP) { + if (runtime_flags & MDBX_DBG_DUMP) { const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) ? 
ignore_enosys(errno) @@ -12197,8 +12091,8 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); #endif /* MDBX_USE_VALGRIND */ - mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) && - used_bytes <= env->me_dxb_mmap.limit); + eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) && + used_bytes <= env->me_dxb_mmap.limit); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (env->me_dxb_mmap.filesize > used_bytes && env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { @@ -12218,18 +12112,18 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, /* recovery mode */ MDBX_meta clone; MDBX_meta const *const target = METAPAGE(env, env->me_stuck_meta); - err = mdbx_validate_meta_copy(env, target, &clone); + err = validate_meta_copy(env, target, &clone); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("target meta[%u] is corrupted", - bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); + ERROR("target meta[%u] is corrupted", + bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); return MDBX_CORRUPTED; } } else /* not recovery mode */ while (1) { - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); const unsigned meta_clash_mask = meta_eq_mask(env); if (unlikely(meta_clash_mask)) { - mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); + ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); return MDBX_CORRUPTED; } @@ -12246,16 +12140,16 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, if (!env->me_lck_mmap.lck) { /* LY: without-lck (read-only) mode, so it is impossible that other * process made weak checkpoint. 
*/ - mdbx_error("%s", "without-lck, unable recovery/rollback"); + ERROR("%s", "without-lck, unable recovery/rollback"); return MDBX_WANNA_RECOVERY; } /* LY: assume just have a collision with other running process, * or someone make a weak checkpoint */ - mdbx_verbose("%s", "assume collision or online weak checkpoint"); + VERBOSE("%s", "assume collision or online weak checkpoint"); break; } - mdbx_assert(env, lck_rc == MDBX_RESULT_TRUE); + eASSERT(env, lck_rc == MDBX_RESULT_TRUE); /* exclusive mode */ MDBX_meta clone; @@ -12263,12 +12157,11 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, const MDBX_meta *const head = constmeta_prefer_last(env); const txnid_t steady_txnid = constmeta_txnid(env, steady); if (META_IS_STEADY(steady)) { - err = mdbx_validate_meta_copy(env, steady, &clone); + err = validate_meta_copy(env, steady, &clone); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("meta[%u] with %s txnid %" PRIaTXN - " is corrupted, %s needed", - bytes2pgno(env, (uint8_t *)steady - env->me_map), "steady", - steady_txnid, "manual recovery"); + ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", + bytes2pgno(env, (uint8_t *)steady - env->me_map), "steady", + steady_txnid, "manual recovery"); return MDBX_CORRUPTED; } if (steady == head) @@ -12278,69 +12171,66 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, const pgno_t pgno = bytes2pgno(env, (uint8_t *)head - env->me_map); const txnid_t head_txnid = constmeta_txnid(env, head); const bool head_valid = - mdbx_validate_meta_copy(env, head, &clone) == MDBX_SUCCESS; - mdbx_assert(env, !META_IS_STEADY(steady) || head_txnid != steady_txnid); + validate_meta_copy(env, head, &clone) == MDBX_SUCCESS; + eASSERT(env, !META_IS_STEADY(steady) || head_txnid != steady_txnid); if (unlikely(!head_valid)) { if (unlikely(!META_IS_STEADY(steady))) { - mdbx_error("%s for open or automatic rollback, %s", - "there are no suitable meta-pages", - "manual recovery is required"); + 
ERROR("%s for open or automatic rollback, %s", + "there are no suitable meta-pages", + "manual recovery is required"); return MDBX_CORRUPTED; } - mdbx_warning("meta[%u] with last txnid %" PRIaTXN - " is corrupted, rollback needed", - pgno, head_txnid); + WARNING("meta[%u] with last txnid %" PRIaTXN + " is corrupted, rollback needed", + pgno, head_txnid); goto purge_meta_head; } if (meta_bootid_match(head)) { if (env->me_flags & MDBX_RDONLY) { - mdbx_error("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, - ", but unable in read-only mode"); + ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", bootid.x, bootid.y, + ", but unable in read-only mode"); return MDBX_WANNA_RECOVERY; } - mdbx_warning("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, - ""); + WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", bootid.x, bootid.y, ""); meta = clone; atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next, mo_Relaxed); break; } if (unlikely(!META_IS_STEADY(steady))) { - mdbx_error("%s, but %s for automatic rollback: %s", - "opening after an unclean shutdown", - "there are no suitable meta-pages", - "manual recovery is required"); + ERROR("%s, but %s for automatic rollback: %s", + "opening after an unclean shutdown", + "there are no suitable meta-pages", + "manual recovery is required"); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { - mdbx_error("%s and rollback needed: (from head %" PRIaTXN - " to steady %" PRIaTXN ")%s", - "opening after an unclean shutdown", head_txnid, - steady_txnid, ", but unable in read-only mode"); + 
ERROR("%s and rollback needed: (from head %" PRIaTXN + " to steady %" PRIaTXN ")%s", + "opening after an unclean shutdown", head_txnid, steady_txnid, + ", but unable in read-only mode"); return MDBX_WANNA_RECOVERY; } purge_meta_head: - mdbx_notice("%s and doing automatic rollback: " - "purge%s meta[%u] with%s txnid %" PRIaTXN, - "opening after an unclean shutdown", - head_valid ? "" : " invalid", pgno, head_valid ? " weak" : "", - head_txnid); - mdbx_ensure(env, META_IS_STEADY(steady)); - err = mdbx_override_meta(env, pgno, 0, head_valid ? head : steady); + NOTICE("%s and doing automatic rollback: " + "purge%s meta[%u] with%s txnid %" PRIaTXN, + "opening after an unclean shutdown", head_valid ? "" : " invalid", + pgno, head_valid ? " weak" : "", head_txnid); + ENSURE(env, META_IS_STEADY(steady)); + err = override_meta(env, pgno, 0, head_valid ? head : steady); if (err) { - mdbx_error("rollback: overwrite meta[%u] with txnid %" PRIaTXN - ", error %d", - pgno, head_txnid, err); + ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d", + pgno, head_txnid, err); return err; } - mdbx_ensure(env, 0 == meta_txnid(env, head)); - mdbx_ensure(env, 0 == meta_eq_mask(env)); + ENSURE(env, 0 == meta_txnid(env, head)); + ENSURE(env, 0 == meta_eq_mask(env)); } if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { @@ -12349,63 +12239,62 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, /* re-check size after mmap */ if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || env->me_dxb_mmap.current < used_bytes) { - mdbx_error("unacceptable/unexpected datafile size %" PRIuPTR, - env->me_dxb_mmap.current); + ERROR("unacceptable/unexpected datafile size %" PRIuPTR, + env->me_dxb_mmap.current); return MDBX_PROBLEM; } if (env->me_dxb_mmap.current != env->me_dbgeo.now) { meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); - mdbx_notice("need update meta-geo to filesize %" PRIuPTR - " bytes, %" PRIaPGNO " pages", - 
env->me_dxb_mmap.current, meta.mm_geo.now); + NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO + " pages", + env->me_dxb_mmap.current, meta.mm_geo.now); } if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { if ((env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) { - mdbx_warning( - "skipped update meta.geo in %s mode: from l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", - (env->me_stuck_meta < 0) ? "read-only" : "recovery", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), pv2pages(head->mm_geo.grow_pv), - meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv)); + WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", + (env->me_stuck_meta < 0) ? 
"read-only" : "recovery", + head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + pv2pages(head->mm_geo.shrink_pv), + pv2pages(head->mm_geo.grow_pv), meta.mm_geo.lower, + meta.mm_geo.now, meta.mm_geo.upper, + pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv)); } else { const txnid_t txnid = constmeta_txnid(env, head); const txnid_t next_txnid = safe64_txnid_next(txnid); if (unlikely(txnid > MAX_TXNID)) { - mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } - mdbx_notice("updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), - pv2pages(meta.mm_geo.grow_pv), next_txnid); + NOTICE("updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + pv2pages(head->mm_geo.shrink_pv), pv2pages(head->mm_geo.grow_pv), + txnid, meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, + pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv), + next_txnid); - mdbx_ensure(env, meta_eq(env, &meta, head)); + ENSURE(env, meta_eq(env, &meta, head)); meta_set_txnid(env, &meta, next_txnid); - err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); + err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); if (err) { - mdbx_error("error %d, while updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" 
PRIaTXN ")", - err, head->mm_geo.lower, head->mm_geo.now, - head->mm_geo.upper, pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), - pv2pages(meta.mm_geo.grow_pv), next_txnid); + ERROR("error %d, while updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + err, head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + pv2pages(head->mm_geo.shrink_pv), + pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, + meta.mm_geo.now, meta.mm_geo.upper, + pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv), + next_txnid); return err; } } @@ -12415,25 +12304,25 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && - (mdbx_runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { + (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { for (int n = 0; n < NUM_METAS; ++n) { MDBX_meta *const pmeta = METAPAGE(env, n); if (unlikely(unaligned_peek_u64(4, &pmeta->mm_magic_and_version) != MDBX_DATA_MAGIC)) { const txnid_t txnid = constmeta_txnid(env, pmeta); - mdbx_notice("%s %s" - "meta[%u], txnid %" PRIaTXN, - "updating db-format signature for", - META_IS_STEADY(pmeta) ? "stead-" : "weak-", n, txnid); - err = mdbx_override_meta(env, n, txnid, pmeta); + NOTICE("%s %s" + "meta[%u], txnid %" PRIaTXN, + "updating db-format signature for", + META_IS_STEADY(pmeta) ? "stead-" : "weak-", n, txnid); + err = override_meta(env, n, txnid, pmeta); if (unlikely(err != MDBX_SUCCESS) && /* Just ignore the MDBX_PROBLEM error, since here it is * returned only in case of the attempt to upgrade an obsolete * meta-page that is invalid for current state of a DB, * e.g. 
after shrinking DB file */ err != MDBX_PROBLEM) { - mdbx_error("%s meta[%u], txnid %" PRIaTXN ", error %d", - "updating db-format signature for", n, txnid, err); + ERROR("%s meta[%u], txnid %" PRIaTXN ", error %d", + "updating db-format signature for", n, txnid, err); return err; } } @@ -12447,9 +12336,9 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, #if defined(MADV_REMOVE) if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 && /* not recovery mode */ env->me_stuck_meta < 0) { - mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); + NOTICE("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); err = madvise(env->me_map + used_aligned2os_bytes, env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) @@ -12460,9 +12349,9 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, } #endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); + NOTICE("open-MADV_%s %u..%u", "DONTNEED", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); err = madvise(env->me_map + used_aligned2os_bytes, env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) @@ -12485,7 +12374,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, #endif /* MADV_DONTNEED */ } - err = mdbx_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); + err = set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); if (unlikely(err != MDBX_SUCCESS)) return err; #endif /* MDBX_ENABLE_MADVISE */ @@ -12496,12 +12385,12 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, /******************************************************************************/ /* Open and/or 
initialize the lock region for the environment. */ -__cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, - mdbx_mode_t mode) { - mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); - mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); +__cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, + mdbx_mode_t mode) { + eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); + eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); - int err = mdbx_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); + int err = osal_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); if (err != MDBX_SUCCESS) { switch (err) { default: @@ -12519,8 +12408,8 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, } if (err != MDBX_ENOFILE) { - /* ensure the file system is read-only */ - err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); + /* ENSURE the file system is read-only */ + err = osal_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); if (err != MDBX_SUCCESS && /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) @@ -12530,12 +12419,12 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ /* beginning of a locked section ---------------------------------------- */ lcklist_lock(); - mdbx_assert(env, env->me_lcklist_next == nullptr); + eASSERT(env, env->me_lcklist_next == nullptr); env->me_lfd = INVALID_HANDLE_VALUE; - const int rc = mdbx_lck_seize(env); + const int rc = osal_lck_seize(env); if (MDBX_IS_ERROR(rc)) { /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by mdbx_env_close0(). */ + * and this job will be done by env_close(). 
*/ lcklist_unlock(); return rc; } @@ -12547,23 +12436,23 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, env->me_lck = lckless_stub(env); env->me_maxreaders = UINT_MAX; - mdbx_debug("lck-setup:%s%s%s", " lck-less", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + DEBUG("lck-setup:%s%s%s", " lck-less", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); return rc; } /* beginning of a locked section ------------------------------------------ */ lcklist_lock(); - mdbx_assert(env, env->me_lcklist_next == nullptr); + eASSERT(env, env->me_lcklist_next == nullptr); /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ - err = mdbx_lck_seize(env); + err = osal_lck_seize(env); if (MDBX_IS_ERROR(err)) { bailout: /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by mdbx_env_close0(). */ + * and this job will be done by env_close(). */ lcklist_unlock(); return err; } @@ -12574,7 +12463,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, if (MDBX_IS_ERROR(err)) goto bailout; if (inprocess_neighbor && - ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { err = MDBX_BUSY; goto bailout; @@ -12582,13 +12471,12 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, } const int lck_seize_rc = err; - mdbx_debug("lck-setup:%s%s%s", " with-lck", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" - : " cooperative"); + DEBUG("lck-setup:%s%s%s", " with-lck", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? 
" exclusive" : " cooperative"); uint64_t size = 0; - err = mdbx_filesize(env->me_lfd, &size); + err = osal_filesize(env->me_lfd, &size); if (unlikely(err != MDBX_SUCCESS)) goto bailout; @@ -12596,7 +12484,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo), env->me_os_psize); - mdbx_jitter4testing(false); + jitter4testing(false); } else { if (env->me_flags & MDBX_EXCLUSIVE) { err = MDBX_BUSY; @@ -12604,7 +12492,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, } if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || size < env->me_os_psize) { - mdbx_error("lck-file has invalid size %" PRIu64 " bytes", size); + ERROR("lck-file has invalid size %" PRIu64 " bytes", size); err = MDBX_PROBLEM; goto bailout; } @@ -12613,7 +12501,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, const size_t maxreaders = ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); if (maxreaders < 4) { - mdbx_error("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); + ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); err = MDBX_PROBLEM; goto bailout; } @@ -12621,7 +12509,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, ? (unsigned)maxreaders : (unsigned)MDBX_READERS_LIMIT; - err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, + err = osal_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size, lck_seize_rc ? 
MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE); @@ -12654,55 +12542,54 @@ __cold static int mdbx_setup_lck(MDBX_env *env, pathchar_t *lck_pathname, if (lck_seize_rc == MDBX_RESULT_TRUE) { /* LY: exclusive mode, check and reset lck content */ memset(lck, 0, (size_t)size); - mdbx_jitter4testing(false); + jitter4testing(false); lck->mti_magic_and_version = MDBX_LOCK_MAGIC; lck->mti_os_and_format = MDBX_LOCK_FORMAT; #if MDBX_ENABLE_PGOP_STAT lck->mti_pgop_stat.wops.weak = 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - err = mdbx_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); + err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("initial-%s for lck-file failed", "msync"); + ERROR("initial-%s for lck-file failed", "msync"); goto bailout; } - err = mdbx_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); + err = osal_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("initial-%s for lck-file failed", "fsync"); + ERROR("initial-%s for lck-file failed", "fsync"); goto bailout; } } else { if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { const bool invalid = (lck->mti_magic_and_version >> 8) != MDBX_MAGIC; - mdbx_error( - "lock region has %s", - invalid - ? "invalid magic" - : "incompatible version (only applications with nearly or the " - "same versions of libmdbx can share the same database)"); + ERROR("lock region has %s", + invalid + ? "invalid magic" + : "incompatible version (only applications with nearly or the " + "same versions of libmdbx can share the same database)"); err = invalid ? 
MDBX_INVALID : MDBX_VERSION_MISMATCH; goto bailout; } if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { - mdbx_error("lock region has os/format signature 0x%" PRIx32 - ", expected 0x%" PRIx32, - lck->mti_os_and_format, MDBX_LOCK_FORMAT); + ERROR("lock region has os/format signature 0x%" PRIx32 + ", expected 0x%" PRIx32, + lck->mti_os_and_format, MDBX_LOCK_FORMAT); err = MDBX_VERSION_MISMATCH; goto bailout; } } - err = mdbx_lck_init(env, inprocess_neighbor, lck_seize_rc); + err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc); if (MDBX_IS_ERROR(err)) goto bailout; - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); /* insert into inprocess lck-list */ env->me_lcklist_next = inprocess_lcklist_head; inprocess_lcklist_head = env; lcklist_unlock(); /* end of a locked section ------------------------------------------------ */ - mdbx_assert(env, !MDBX_IS_ERROR(lck_seize_rc)); + eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); env->me_lck = lck; return lck_seize_rc; } @@ -12760,24 +12647,26 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { return r; } -__cold static int __must_check_result mdbx_override_meta( - MDBX_env *env, unsigned target, txnid_t txnid, const MDBX_meta *shape) { +__cold static int __must_check_result override_meta(MDBX_env *env, + unsigned target, + txnid_t txnid, + const MDBX_meta *shape) { int rc = alloc_page_buf(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_page *const page = env->me_pbuf; - mdbx_meta_model(env, page, target); + meta_model(env, page, target); MDBX_meta *const model = page_meta(page); meta_set_txnid(env, model, txnid); - mdbx_assert(env, meta_checktxnid(env, model, true)); + eASSERT(env, meta_checktxnid(env, model, true)); if (shape) { if (txnid && unlikely(!meta_checktxnid(env, shape, false))) { - mdbx_error("bailout overriding meta-%u since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, - target, "pre", constmeta_txnid(env, 
shape)); + ERROR("bailout overriding meta-%u since model failed " + "freedb/maindb %s-check for txnid #%" PRIaTXN, + target, "pre", constmeta_txnid(env, shape)); return MDBX_PROBLEM; } - if (mdbx_runtime_flags & MDBX_DBG_DONT_UPGRADE) + if (runtime_flags & MDBX_DBG_DONT_UPGRADE) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); model->mm_extra_flags = shape->mm_extra_flags; @@ -12796,15 +12685,15 @@ __cold static int __must_check_result mdbx_override_meta( memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); if (unlikely(!meta_checktxnid(env, model, false))) { - mdbx_error("bailout overriding meta-%u since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, - target, "post", txnid); + ERROR("bailout overriding meta-%u since model failed " + "freedb/maindb %s-check for txnid #%" PRIaTXN, + target, "post", txnid); return MDBX_PROBLEM; } } } unaligned_poke_u64(4, model->mm_datasync_sign, meta_sign(model)); - rc = mdbx_validate_meta(env, model, page, target, nullptr); + rc = validate_meta(env, model, page, target, nullptr); if (unlikely(MDBX_IS_ERROR(rc))) return MDBX_PROBLEM; @@ -12815,27 +12704,27 @@ __cold static int __must_check_result mdbx_override_meta( env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ if (env->me_flags & MDBX_WRITEMAP) { - rc = mdbx_msync(&env->me_dxb_mmap, 0, + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, model->mm_geo.next), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (unlikely(rc != MDBX_SUCCESS)) return rc; - /* mdbx_override_meta() called only while current process have exclusive + /* override_meta() called only while current process have exclusive * lock of a DB file. 
So meta-page could be updated directly without * clearing consistency flag by mdbx_meta_update_begin() */ memcpy(pgno2page(env, target), page, env->me_psize); - mdbx_flush_incoherent_cpu_writeback(); - rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), + osal_flush_incoherent_cpu_writeback(); + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) ? env->me_dsync_fd : env->me_lazy_fd; - rc = mdbx_pwrite(fd, page, env->me_psize, pgno2bytes(env, target)); + rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target)); if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd) - rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } - mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); meta_cache_clear(env); return rc; @@ -12859,8 +12748,8 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { MDBX_meta meta = *page_meta(page); if (n == target) continue; - if (mdbx_validate_meta(env, &meta, page, n, nullptr) != MDBX_SUCCESS) { - int err = mdbx_override_meta(env, n, 0, nullptr); + if (validate_meta(env, &meta, page, n, nullptr) != MDBX_SUCCESS) { + int err = override_meta(env, n, 0, nullptr); if (unlikely(err != MDBX_SUCCESS)) return err; } else { @@ -12871,17 +12760,17 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { } if (unlikely(new_txnid > MAX_TXNID)) { - mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } - return mdbx_override_meta(env, target, new_txnid, target_meta); + return override_meta(env, target, new_txnid, target_meta); } __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char 
*pathname, unsigned target_meta, bool writeable) { #if defined(_WIN32) || defined(_WIN64) const wchar_t *pathnameW = nullptr; - MUSTDIE_MB2WIDE(pathname, pathnameW); + OSAL_MB2WIDE(pathname, pathnameW); return mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); } @@ -12930,10 +12819,10 @@ static bool path_equal(const pathchar_t *l, const pathchar_t *r, size_t len) { #endif } -__cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, - const pathchar_t *pathname, - MDBX_env_flags_t *flags, - const mdbx_mode_t mode) { +__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, + const pathchar_t *pathname, + MDBX_env_flags_t *flags, + const mdbx_mode_t mode) { memset(ctx, 0, sizeof(*ctx)); if (unlikely(!pathname || !*pathname)) return MDBX_EINVAL; @@ -13016,7 +12905,7 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, sizeof(pathchar_t) * ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR) ? sizeof(lock_suffix) + sizeof(pathchar_t) : sizeof(lck_name) + sizeof(dxb_name)); - ctx->buffer_for_free = mdbx_malloc(bytes_needed); + ctx->buffer_for_free = osal_malloc(bytes_needed); if (!ctx->buffer_for_free) return MDBX_ENOMEM; @@ -13038,7 +12927,7 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { #if defined(_WIN32) || defined(_WIN64) const wchar_t *pathnameW = nullptr; - MUSTDIE_MB2WIDE(pathname, pathnameW); + OSAL_MB2WIDE(pathname, pathnameW); return mdbx_env_deleteW(pathnameW, mode); } @@ -13063,35 +12952,35 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, memset(dummy_env, 0, sizeof(*dummy_env)); dummy_env->me_flags = (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; - dummy_env->me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env->me_os_psize = (unsigned)osal_syspagesize(); dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); dummy_env->me_pathname = (pathchar_t *)pathname; MDBX_handle_env_pathname env_pathname; STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); int rc = MDBX_RESULT_TRUE, - err = mdbx_handle_env_pathname( - &env_pathname, pathname, (MDBX_env_flags_t *)&dummy_env->me_flags, 0); + err = handle_env_pathname(&env_pathname, pathname, + (MDBX_env_flags_t *)&dummy_env->me_flags, 0); if (likely(err == MDBX_SUCCESS)) { mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, dxb_handle = INVALID_HANDLE_VALUE; if (mode > MDBX_ENV_JUST_DELETE) { - err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, &dxb_handle, 0); err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; if (err == MDBX_SUCCESS) { - err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, &clk_handle, 0); err = (err == MDBX_ENOFILE) ? 
MDBX_SUCCESS : err; } if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) - err = mdbx_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + err = osal_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) - err = mdbx_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + err = osal_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); } if (err == MDBX_SUCCESS) { - err = mdbx_removefile(env_pathname.dxb); + err = osal_removefile(env_pathname.dxb); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -13099,7 +12988,7 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } if (err == MDBX_SUCCESS) { - err = mdbx_removefile(env_pathname.lck); + err = osal_removefile(env_pathname.lck); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -13107,7 +12996,7 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) { - err = mdbx_removedirectory(pathname); + err = osal_removedirectory(pathname); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -13115,13 +13004,13 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } if (dxb_handle != INVALID_HANDLE_VALUE) - mdbx_closefile(dxb_handle); + osal_closefile(dxb_handle); if (clk_handle != INVALID_HANDLE_VALUE) - mdbx_closefile(clk_handle); + osal_closefile(clk_handle); } else if (err == MDBX_ENOFILE) err = MDBX_SUCCESS; - mdbx_free(env_pathname.buffer_for_free); + osal_free(env_pathname.buffer_for_free); return (err == MDBX_SUCCESS) ? 
rc : err; } @@ -13129,7 +13018,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode) { #if defined(_WIN32) || defined(_WIN64) const wchar_t *pathnameW = nullptr; - MUSTDIE_MB2WIDE(pathname, pathnameW); + OSAL_MB2WIDE(pathname, pathnameW); return mdbx_env_openW(env, pathnameW, flags, mode); } @@ -13167,9 +13056,9 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (flags & MDBX_ACCEDE) flags |= MDBX_WRITEMAP; else { - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, - "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " - "of an internal flaw(s) in a file/buffer/page cache.\n"); + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); return 42 /* ENOPROTOOPT */; } } @@ -13177,15 +13066,15 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } MDBX_handle_env_pathname env_pathname; - rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); + rc = handle_env_pathname(&env_pathname, pathname, &flags, mode); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t)); - env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); - env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); - env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); + env->me_pathname = osal_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t)); + env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); + env->me_dbflags = osal_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); + env->me_dbiseqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && env->me_dbiseqs)) { rc = MDBX_ENOMEM; @@ -13196,18 +13085,18 @@ __cold int 
mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; - rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ + rc = osal_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ : MDBX_OPEN_DXB_LAZY, env, env_pathname.dxb, &env->me_lazy_fd, mode); if (rc != MDBX_SUCCESS) goto bailout; - mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); + eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); - mdbx_ensure(env, (rc != MDBX_SUCCESS) == - (env->me_dsync_fd == INVALID_HANDLE_VALUE)); + ENSURE(env, + (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE)); } #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -13234,7 +13123,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0); #endif /* !Windows */ - const int lck_rc = mdbx_setup_lck(env, env_pathname.lck, mode); + const int lck_rc = setup_lck(env, env_pathname.lck, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; goto bailout; @@ -13242,10 +13131,10 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, /* Set the position in files outside of the data to avoid corruption * due to erroneous use of file descriptors in the application code. 
*/ - mdbx_fseek(env->me_lfd, UINT64_C(1) << 63); - mdbx_fseek(env->me_lazy_fd, UINT64_C(1) << 63); + osal_fseek(env->me_lfd, UINT64_C(1) << 63); + osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63); if (env->me_dsync_fd != INVALID_HANDLE_VALUE) - mdbx_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63); const MDBX_env_flags_t rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; @@ -13278,19 +13167,19 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */ const unsigned diff = (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; - mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, - env->me_flags ^ diff); + NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, + env->me_flags ^ diff); env->me_flags ^= diff; } if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { - mdbx_error("%s", "current mode/flags incompatible with requested"); + ERROR("%s", "current mode/flags incompatible with requested"); rc = MDBX_INCOMPATIBLE; goto bailout; } } - const int dxb_rc = mdbx_setup_dxb(env, lck_rc, mode); + const int dxb_rc = setup_dxb(env, lck_rc, mode); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; @@ -13299,32 +13188,31 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || (flags & MDBX_EXCLUSIVE) == 0)) { - mdbx_error("%s", "recovery requires exclusive mode"); + ERROR("%s", "recovery requires exclusive mode"); rc = MDBX_BUSY; goto bailout; } - mdbx_debug("opened dbenv %p", (void *)env); + DEBUG("opened dbenv %p", (void *)env); if (lck) { if (lck_rc == MDBX_RESULT_TRUE) { lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); - lck->mti_meta_sync_txnid.weak = - (uint32_t)mdbx_recent_committed_txnid(env); - lck->mti_reader_check_timestamp.weak = 
mdbx_osal_monotime(); - rc = mdbx_lck_downgrade(env); - mdbx_debug("lck-downgrade-%s: rc %i", - (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); + lck->mti_meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env); + lck->mti_reader_check_timestamp.weak = osal_monotime(); + rc = osal_lck_downgrade(env); + DEBUG("lck-downgrade-%s: rc %i", + (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); if (rc != MDBX_SUCCESS) goto bailout; } else { - rc = mdbx_cleanup_dead_readers(env, false, NULL); + rc = cleanup_dead_readers(env, false, NULL); if (MDBX_IS_ERROR(rc)) goto bailout; } if ((env->me_flags & MDBX_NOTLS) == 0) { - rc = mdbx_rthc_alloc(&env->me_txkey, &lck->mti_readers[0], - &lck->mti_readers[env->me_maxreaders]); + rc = rthc_alloc(&env->me_txkey, &lck->mti_readers[0], + &lck->mti_readers[env->me_maxreaders]); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; env->me_flags |= MDBX_ENV_TXKEY; @@ -13332,8 +13220,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } else { env->me_lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)mdbx_recent_committed_txnid(env); - env->me_lck->mti_reader_check_timestamp.weak = mdbx_osal_monotime(); + (uint32_t)recent_committed_txnid(env); + env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); } if ((flags & MDBX_RDONLY) == 0) { @@ -13344,7 +13232,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = alloc_page_buf(env); if (rc == MDBX_SUCCESS) { memset(env->me_pbuf, -1, env->me_psize * 2); - MDBX_txn *txn = mdbx_calloc(1, size); + MDBX_txn *txn = osal_calloc(1, size); if (txn) { txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); @@ -13354,8 +13242,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; - txn->tw.retired_pages = 
mdbx_pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.reclaimed_pglist = pnl_alloc(MDBX_PNL_INITIAL); if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) rc = MDBX_ENOMEM; } else @@ -13368,48 +13256,48 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, const MDBX_meta *meta = (const MDBX_meta *)meta_prefer_last(env); const MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; - mdbx_debug("opened database version %u, pagesize %u", - (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), - env->me_psize); - mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - data_page(meta)->mp_pgno, meta_txnid(env, meta)); - mdbx_debug("depth: %u", db->md_depth); - mdbx_debug("entries: %" PRIu64, db->md_entries); - mdbx_debug("branch pages: %" PRIaPGNO, db->md_branch_pages); - mdbx_debug("leaf pages: %" PRIaPGNO, db->md_leaf_pages); - mdbx_debug("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); - mdbx_debug("root: %" PRIaPGNO, db->md_root); - mdbx_debug("schema_altered: %" PRIaTXN, db->md_mod_txnid); + DEBUG("opened database version %u, pagesize %u", + (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), + env->me_psize); + DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, + data_page(meta)->mp_pgno, meta_txnid(env, meta)); + DEBUG("depth: %u", db->md_depth); + DEBUG("entries: %" PRIu64, db->md_entries); + DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); + DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); + DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); + DEBUG("root: %" PRIaPGNO, db->md_root); + DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); } #endif bailout: if (rc != MDBX_SUCCESS) { - rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; + rc = env_close(env) ? MDBX_PANIC : rc; env->me_flags = saved_me_flags | ((rc != MDBX_PANIC) ? 
0 : MDBX_FATAL_ERROR); } else { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif } - mdbx_free(env_pathname.buffer_for_free); + osal_free(env_pathname.buffer_for_free); return rc; } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -__cold static int mdbx_env_close0(MDBX_env *env) { +__cold static int env_close(MDBX_env *env) { const unsigned flags = env->me_flags; if (!(flags & MDBX_ENV_ACTIVE)) { - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); return MDBX_SUCCESS; } env->me_flags &= ~ENV_INTERNAL_FLAGS; env->me_lck = nullptr; if (flags & MDBX_ENV_TXKEY) { - mdbx_rthc_remove(env->me_txkey); - env->me_txkey = (mdbx_thread_key_t)0; + rthc_remove(env->me_txkey); + env->me_txkey = (osal_thread_key_t)0; } lcklist_lock(); @@ -13417,7 +13305,7 @@ __cold static int mdbx_env_close0(MDBX_env *env) { lcklist_unlock(); if (env->me_map) { - mdbx_munmap(&env->me_dxb_mmap); + osal_munmap(&env->me_dxb_mmap); #ifdef MDBX_USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; @@ -13425,52 +13313,52 @@ __cold static int mdbx_env_close0(MDBX_env *env) { } if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_dsync_fd); + (void)osal_closefile(env->me_dsync_fd); env->me_dsync_fd = INVALID_HANDLE_VALUE; } if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_lazy_fd); + (void)osal_closefile(env->me_lazy_fd); env->me_lazy_fd = INVALID_HANDLE_VALUE; } if (env->me_lck_mmap.lck) - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_lfd); + (void)osal_closefile(env->me_lfd); env->me_lfd = INVALID_HANDLE_VALUE; } if (env->me_dbxs) { for (unsigned i = env->me_numdbs; --i >= CORE_DBS;) - mdbx_free(env->me_dbxs[i].md_name.iov_base); - mdbx_free(env->me_dbxs); + 
osal_free(env->me_dbxs[i].md_name.iov_base); + osal_free(env->me_dbxs); env->me_dbxs = nullptr; } if (env->me_pbuf) { - mdbx_memalign_free(env->me_pbuf); + osal_memalign_free(env->me_pbuf); env->me_pbuf = nullptr; } if (env->me_dbiseqs) { - mdbx_free(env->me_dbiseqs); + osal_free(env->me_dbiseqs); env->me_dbiseqs = nullptr; } if (env->me_dbflags) { - mdbx_free(env->me_dbflags); + osal_free(env->me_dbflags); env->me_dbflags = nullptr; } if (env->me_pathname) { - mdbx_free(env->me_pathname); + osal_free(env->me_pathname); env->me_pathname = nullptr; } if (env->me_txn0) { - mdbx_dpl_free(env->me_txn0); - mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); - mdbx_pnl_free(env->me_txn0->tw.retired_pages); - mdbx_pnl_free(env->me_txn0->tw.spill_pages); - mdbx_pnl_free(env->me_txn0->tw.reclaimed_pglist); - mdbx_free(env->me_txn0); + dpl_free(env->me_txn0); + txl_free(env->me_txn0->tw.lifo_reclaimed); + pnl_free(env->me_txn0->tw.retired_pages); + pnl_free(env->me_txn0->tw.spill_pages); + pnl_free(env->me_txn0->tw.reclaimed_pglist); + osal_free(env->me_txn0); env->me_txn0 = nullptr; } env->me_stuck_meta = -1; @@ -13492,13 +13380,13 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { * platforms (i.e. where fork() is available). * This is required to legitimize a call after fork() * from a child process, that should be allowed to free resources. */ - if (unlikely(env->me_pid != mdbx_getpid())) + if (unlikely(env->me_pid != osal_getpid())) env->me_flags |= MDBX_FATAL_ERROR; #endif /* MDBX_ENV_CHECKPID */ if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 && env->me_txn0) { - if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != mdbx_thread_self()) + if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != osal_thread_self()) return MDBX_BUSY; } else dont_sync = true; @@ -13512,14 +13400,14 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { * process is running a writing transaction or not. 
* Because in the "owner died" condition kernel don't release * file lock immediately. */ - rc = mdbx_env_sync_internal(env, true, false); + rc = env_sync(env, true, false); rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; #else struct stat st; if (unlikely(fstat(env->me_lazy_fd, &st))) rc = errno; else if (st.st_nlink > 0 /* don't sync deleted files */) { - rc = mdbx_env_sync_internal(env, true, true); + rc = env_sync(env, true, true); rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS @@ -13528,32 +13416,31 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #endif } - mdbx_assert(env, env->me_signature.weak == 0); - rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; - mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); + eASSERT(env, env->me_signature.weak == 0); + rc = env_close(env) ? MDBX_PANIC : rc; + ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); #if defined(_WIN32) || defined(_WIN64) /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ DeleteCriticalSection(&env->me_windowsbug_lock); #else - mdbx_ensure(env, - mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); #endif /* Windows */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - mdbx_ensure(env, mdbx_ipclock_destroy(&stub->mti_wlock) == 0); + ENSURE(env, osal_ipclock_destroy(&stub->mti_wlock) == 0); #endif /* MDBX_LOCKING */ while ((dp = env->me_dp_reserve) != NULL) { MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); env->me_dp_reserve = dp->mp_next; - mdbx_free(dp); + osal_free(dp); } VALGRIND_DESTROY_MEMPOOL(env); - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); env->me_pid = 0; - mdbx_free(env); + osal_free(env); 
return rc; } @@ -13566,7 +13453,7 @@ __cold int mdbx_env_close(MDBX_env *env) { /* Compare two items pointing at aligned unsigned int's. */ __hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); + eASSERT(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: return CMP2INT(unaligned_peek_u32(4, a->iov_base), @@ -13583,7 +13470,7 @@ __hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { /* Compare two items pointing at 2-byte aligned unsigned int's. */ __hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); + eASSERT(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: return CMP2INT(unaligned_peek_u32(2, a->iov_base), @@ -13602,7 +13489,7 @@ __hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { * * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ __hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); + eASSERT(NULL, a->iov_len == b->iov_len); switch (a->iov_len) { case 4: return CMP2INT(unaligned_peek_u32(1, a->iov_base), @@ -13664,15 +13551,15 @@ static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, * Returns the smallest entry larger or equal to the key. * Updates the cursor index with the index of the found entry. * If no entry larger or equal to the key is found, returns NULL. */ -__hot static struct node_result mdbx_node_search(MDBX_cursor *mc, - const MDBX_val *key) { +__hot static struct node_result node_search(MDBX_cursor *mc, + const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; const int nkeys = page_numkeys(mp); DKBUF_DEBUG; - mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO, nkeys, - IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mp->mp_pgno); + DEBUG("searching %u keys in %s %spage %" PRIaPGNO, nkeys, + IS_LEAF(mp) ? 
"leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mp->mp_pgno); struct node_result ret; ret.exact = false; @@ -13689,16 +13576,15 @@ __hot static struct node_result mdbx_node_search(MDBX_cursor *mc, MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; MDBX_val nodekey; if (unlikely(IS_LEAF2(mp))) { - mdbx_cassert(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); + cASSERT(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); nodekey.iov_len = mp->mp_leaf2_ksize; do { i = (low + high) >> 1; nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); - mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); + cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); int cr = cmp(key, &nodekey); - mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), - cr); + DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); if (cr > 0) /* Found entry is less than the key. */ /* Skip to get the smallest entry larger than key. */ @@ -13730,15 +13616,14 @@ __hot static struct node_result mdbx_node_search(MDBX_cursor *mc, node = page_node(mp, i); nodekey.iov_len = node_ks(node); nodekey.iov_base = node_key(node); - mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); + cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); int cr = cmp(key, &nodekey); if (IS_LEAF(mp)) - mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), - cr); + DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); else - mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, - DKEY_DEBUG(&nodekey), node_pgno(node), cr); + DEBUG("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, + DKEY_DEBUG(&nodekey), node_pgno(node), cr); if (cr > 0) /* Found entry is less than the key. */ /* Skip to get the smallest entry larger than key. 
*/ @@ -13760,10 +13645,10 @@ __hot static struct node_result mdbx_node_search(MDBX_cursor *mc, } /* Pop a page off the top of the cursor's stack. */ -static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { +static __inline void cursor_pop(MDBX_cursor *mc) { if (likely(mc->mc_snum)) { - mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", - mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); + DEBUG("popped page %" PRIaPGNO " off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); if (likely(--mc->mc_snum)) { mc->mc_top--; } else { @@ -13774,9 +13659,9 @@ static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { /* Push a page onto the top of the cursor's stack. * Set MDBX_TXN_ERROR on failure. */ -static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { - mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, - DDBI(mc), (void *)mc); +static __inline int cursor_push(MDBX_cursor *mc, MDBX_page *mp) { + DEBUG("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), + (void *)mc); if (unlikely(mc->mc_snum >= CURSOR_STACK)) { mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; @@ -13850,7 +13735,7 @@ __cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL, const txnid_t front) { pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)}; if (likely(r.err == MDBX_SUCCESS)) - r.err = mdbx_page_check(mc, page); + r.err = page_check(mc, page); if (unlikely(r.err != MDBX_SUCCESS)) mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return r; @@ -13861,11 +13746,11 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, const pgno_t pgno, const txnid_t front) { MDBX_txn *const txn = mc->mc_txn; - mdbx_tassert(txn, front <= txn->mt_front); + tASSERT(txn, front <= txn->mt_front); pgr_t r; if (unlikely(pgno >= txn->mt_next_pgno)) { - mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); + ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno); r.page = nullptr; r.err = 
MDBX_PAGE_NOTFOUND; bailout: @@ -13873,8 +13758,8 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, return r; } - mdbx_assert(txn->mt_env, - ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0); + eASSERT(txn->mt_env, + ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0); r.page = pgno2page(txn->mt_env, pgno); if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) { const MDBX_txn *spiller = txn; @@ -13884,11 +13769,11 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). */ if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && - mdbx_search_spilled(spiller, pgno)) + search_spilled(spiller, pgno)) break; const unsigned i = dpl_search(spiller, pgno); - mdbx_tassert(txn, (int)i > 0); + tASSERT(txn, (int)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; r.page = spiller->tw.dirtylist->items[i].ptr; @@ -13921,8 +13806,8 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, /* Finish mdbx_page_search() / mdbx_page_search_lowest(). * The cursor is at the root page, set up the rest of it. */ -__hot __noinline static int -mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags) { +__hot __noinline static int page_search_root(MDBX_cursor *mc, + const MDBX_val *key, int flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; int rc; DKBUF_DEBUG; @@ -13931,13 +13816,13 @@ mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags) { MDBX_node *node; int i; - mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, - page_numkeys(mp)); + DEBUG("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, + page_numkeys(mp)); /* Don't assert on branch pages in the GC. We can get here * while in the process of rebalancing a GC branch page; we must * let that proceed. 
ITS#8336 */ - mdbx_cassert(mc, !mc->mc_dbi || page_numkeys(mp) > 1); - mdbx_debug("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); + cASSERT(mc, !mc->mc_dbi || page_numkeys(mp) > 1); + DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { i = 0; @@ -13953,15 +13838,15 @@ mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags) { } } } else { - const struct node_result nsr = mdbx_node_search(mc, key); + const struct node_result nsr = node_search(mc, key); if (likely(nsr.node)) i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; else i = page_numkeys(mp) - 1; - mdbx_debug("following index %u for key [%s]", i, DKEY_DEBUG(key)); + DEBUG("following index %u for key [%s]", i, DKEY_DEBUG(key)); } - mdbx_cassert(mc, i >= 0 && i < (int)page_numkeys(mp)); + cASSERT(mc, i >= 0 && i < (int)page_numkeys(mp)); node = page_node(mp, i); rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); @@ -13969,33 +13854,33 @@ mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags) { return rc; mc->mc_ki[mc->mc_top] = (indx_t)i; - if (unlikely(rc = mdbx_cursor_push(mc, mp))) + if (unlikely(rc = cursor_push(mc, mp))) return rc; ready: if (flags & MDBX_PS_MODIFY) { - if (unlikely((rc = mdbx_page_touch(mc)) != 0)) + if (unlikely((rc = page_touch(mc)) != 0)) return rc; mp = mc->mc_pg[mc->mc_top]; } } if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } - mdbx_debug("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, - DKEY_DEBUG(key)); + DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, + DKEY_DEBUG(key)); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; return MDBX_SUCCESS; } -static int mdbx_setup_dbx(MDBX_dbx *const 
dbx, const MDBX_db *const db, - const unsigned pagesize) { +static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, + const unsigned pagesize) { if (unlikely(!dbx->md_cmp)) { dbx->md_cmp = get_default_keycmp(db->md_flags); dbx->md_dcmp = get_default_datacmp(db->md_flags); @@ -14015,8 +13900,8 @@ static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min || db->md_xsize > dbx->md_vlen_max)) { - mdbx_error("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", - db->md_xsize, dbx->md_vlen_min, dbx->md_vlen_max); + ERROR("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", db->md_xsize, + dbx->md_vlen_min, dbx->md_vlen_max); return MDBX_CORRUPTED; } dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize; @@ -14024,51 +13909,49 @@ static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, return MDBX_SUCCESS; } -static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { +static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { MDBX_cursor_couple couple; if (unlikely(TXN_DBI_CHANGED(txn, dbi))) { - mdbx_notice("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); + NOTICE("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); return MDBX_BAD_DBI; } - int rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + int rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_dbx *const dbx = &txn->mt_dbxs[dbi]; - rc = mdbx_page_search(&couple.outer, &dbx->md_name, 0); + rc = page_search(&couple.outer, &dbx->md_name, 0); if (unlikely(rc != MDBX_SUCCESS)) { notfound: - mdbx_notice("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN - " (err %d)", - dbi, (int)dbx->md_name.iov_len, - (const char *)dbx->md_name.iov_base, txn->mt_txnid, rc); + NOTICE("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN + " (err %d)", + dbi, 
(int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, rc); return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc; } MDBX_val data; - struct node_result nsr = mdbx_node_search(&couple.outer, &dbx->md_name); + struct node_result nsr = node_search(&couple.outer, &dbx->md_name); if (unlikely(!nsr.exact)) { rc = MDBX_NOTFOUND; goto notfound; } if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { - mdbx_notice( - "dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, - (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong flags"); + NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, "wrong flags"); return MDBX_INCOMPATIBLE; /* not a named DB */ } - rc = mdbx_node_read(&couple.outer, nsr.node, &data, - couple.outer.mc_pg[couple.outer.mc_top]); + rc = node_read(&couple.outer, nsr.node, &data, + couple.outer.mc_pg[couple.outer.mc_top]); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (unlikely(data.iov_len != sizeof(MDBX_db))) { - mdbx_notice( - "dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, - (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong rec-size"); + NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, "wrong rec-size"); return MDBX_INCOMPATIBLE; /* not a named DB */ } @@ -14077,25 +13960,24 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { * have dropped and recreated the DB with other flags. 
*/ MDBX_db *const db = &txn->mt_dbs[dbi]; if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) { - mdbx_notice("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN - " with different flags (present 0x%X != wanna 0x%X)", - dbi, (int)dbx->md_name.iov_len, - (const char *)dbx->md_name.iov_base, txn->mt_txnid, - db->md_flags & DB_PERSISTENT_FLAGS, md_flags); + NOTICE("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN + " with different flags (present 0x%X != wanna 0x%X)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, db->md_flags & DB_PERSISTENT_FLAGS, md_flags); return MDBX_INCOMPATIBLE; } memcpy(db, data.iov_base, sizeof(MDBX_db)); #if !MDBX_DISABLE_VALIDATION const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; - mdbx_tassert(txn, txn->mt_front >= pp_txnid); + tASSERT(txn, txn->mt_front >= pp_txnid); if (unlikely(db->md_mod_txnid > pp_txnid)) { - mdbx_error("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", - db->md_mod_txnid, pp_txnid); + ERROR("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + db->md_mod_txnid, pp_txnid); return MDBX_CORRUPTED; } #endif /* !MDBX_DISABLE_VALIDATION */ - rc = mdbx_setup_dbx(dbx, db, txn->mt_env->me_psize); + rc = setup_dbx(dbx, db, txn->mt_env->me_psize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14108,9 +13990,9 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { * before calling mdbx_page_search_root(), because the callers * are all in situations where the current page is known to * be underfilled. 
*/ -__hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { +__hot static int page_search_lowest(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_BRANCH(mp)); + cASSERT(mc, IS_BRANCH(mp)); MDBX_node *node = page_node(mp, 0); int rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); @@ -14118,9 +14000,9 @@ __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { return rc; mc->mc_ki[mc->mc_top] = 0; - if (unlikely(rc = mdbx_cursor_push(mc, mp))) + if (unlikely(rc = cursor_push(mc, mp))) return rc; - return mdbx_page_search_root(mc, NULL, MDBX_PS_FIRST); + return page_search_root(mc, NULL, MDBX_PS_FIRST); } /* Search for the page a given key should be in. @@ -14137,32 +14019,31 @@ __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { * lookups. * * Returns 0 on success, non-zero on failure. */ -__hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, - int flags) { +__hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { int rc; pgno_t root; /* Make sure the txn is still viable, then find the root from * the txn's db table and set it as the root of the cursor's stack. */ if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { - mdbx_debug("%s", "transaction has failed, must abort"); + DEBUG("%s", "transaction has failed, must abort"); return MDBX_BAD_TXN; } /* Make sure we're using an up-to-date root */ if (unlikely(*mc->mc_dbistate & DBI_STALE)) { - rc = mdbx_fetch_sdb(mc->mc_txn, mc->mc_dbi); + rc = fetch_sdb(mc->mc_txn, mc->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } root = mc->mc_db->md_root; if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ - mdbx_debug("%s", "tree is empty"); + DEBUG("%s", "tree is empty"); return MDBX_NOTFOUND; } - mdbx_cassert(mc, root >= NUM_METAS); + cASSERT(mc, root >= NUM_METAS); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { txnid_t pp_txnid = mc->mc_db->md_mod_txnid; pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid @@ -14186,36 +14067,35 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, mc->mc_snum = 1; mc->mc_top = 0; - mdbx_debug("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, - mc->mc_pg[0]->mp_flags); + DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, + mc->mc_pg[0]->mp_flags); if (flags & MDBX_PS_MODIFY) { - if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = mdbx_touch_dbi(mc))) + if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = touch_dbi(mc))) return rc; - if (unlikely(rc = mdbx_page_touch(mc))) + if (unlikely(rc = page_touch(mc))) return rc; } if (flags & MDBX_PS_ROOTONLY) return MDBX_SUCCESS; - return mdbx_page_search_root(mc, key, flags); + return page_search_root(mc, key, flags); } /* Read large/overflow node data. 
*/ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, MDBX_val *data, const MDBX_page *mp) { - mdbx_cassert(mc, - node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); + cASSERT(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); if (unlikely((lp.err != MDBX_SUCCESS))) { - mdbx_debug("read large/overflow page %" PRIaPGNO " failed", - node_largedata_pgno(node)); + DEBUG("read large/overflow page %" PRIaPGNO " failed", + node_largedata_pgno(node)); return lp.err; } - mdbx_cassert(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); data->iov_base = page_data(lp.page); if (!MDBX_DISABLE_VALIDATION) { const MDBX_env *env = mc->mc_txn->mt_env; @@ -14237,9 +14117,8 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, } /* Return the data associated with a given node. */ -static __always_inline int mdbx_node_read(MDBX_cursor *mc, - const MDBX_node *node, MDBX_val *data, - const MDBX_page *mp) { +static __always_inline int node_read(MDBX_cursor *mc, const MDBX_node *node, + MDBX_val *data, const MDBX_page *mp) { data->iov_len = node_ds(node); data->iov_base = node_data(node); if (likely(node_flags(node) != F_BIGDATA)) @@ -14249,7 +14128,7 @@ static __always_inline int mdbx_node_read(MDBX_cursor *mc, int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { DKBUF_DEBUG; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -14262,11 +14141,11 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - 
return mdbx_cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; + return cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; } int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, @@ -14285,7 +14164,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, return MDBX_BAD_TXN; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14295,7 +14174,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, size_t *values_count) { DKBUF_DEBUG; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -14308,11 +14187,11 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; + rc = cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND && values_count) *values_count = 0; @@ -14326,8 +14205,8 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, cx.outer.mc_ki[cx.outer.mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { // coverity[uninit_use : FALSE] - mdbx_tassert(txn, cx.outer.mc_xcursor == &cx.inner && - (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); + tASSERT(txn, cx.outer.mc_xcursor == &cx.inner && + (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); // coverity[uninit_use : FALSE] *values_count = (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || @@ -14348,7 +14227,7 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val 
*key, MDBX_val *data, * [in] dir SIBLING_LEFT or SIBLING_RIGHT. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { +static int cursor_sibling(MDBX_cursor *mc, int dir) { int rc; MDBX_node *node; MDBX_page *mp; @@ -14357,16 +14236,16 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { if (unlikely(mc->mc_snum < 2)) return MDBX_NOTFOUND; /* root has no siblings */ - mdbx_cursor_pop(mc); - mdbx_debug("parent page is page %" PRIaPGNO ", index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); + cursor_pop(mc); + DEBUG("parent page is page %" PRIaPGNO ", index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); if ((dir == SIBLING_RIGHT) ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) : (mc->mc_ki[mc->mc_top] == 0)) { - mdbx_debug("no more keys aside, moving to next %s sibling", - dir ? "right" : "left"); - if (unlikely((rc = mdbx_cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { + DEBUG("no more keys aside, moving to next %s sibling", + dir ? "right" : "left"); + if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { /* undo cursor_pop before returning */ mc->mc_top++; mc->mc_snum++; @@ -14375,11 +14254,10 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { } else { assert((dir - 1) == -1 || (dir - 1) == 1); mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1); - mdbx_debug("just moving to %s index key %u", - (dir == SIBLING_RIGHT) ? "right" : "left", - mc->mc_ki[mc->mc_top]); + DEBUG("just moving to %s index key %u", + (dir == SIBLING_RIGHT) ? 
"right" : "left", mc->mc_ki[mc->mc_top]); } - mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); @@ -14389,7 +14267,7 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { return rc; } - rc = mdbx_cursor_push(mc, mp); + rc = cursor_push(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14399,8 +14277,8 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { } /* Move the cursor to the next data item. */ -static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node; int rc; @@ -14409,7 +14287,7 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return mdbx_cursor_first(mc, key, data); + return cursor_first(mc, key, data); mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { @@ -14422,8 +14300,7 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { - rc = - mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); + rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { if (likely(rc == MDBX_SUCCESS)) get_key_optional(node, key); @@ -14437,8 +14314,8 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } } - mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); + DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; goto skip; 
@@ -14448,26 +14325,25 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)++ki; const int numkeys = page_numkeys(mp); if (unlikely(ki >= numkeys)) { - mdbx_debug("%s", "=====> move to next sibling page"); + DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + rc = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(rc != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); } skip: - mdbx_debug("==> cursor points to page %" PRIaPGNO - " with %u keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } @@ -14481,14 +14357,14 @@ skip: node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mp); + rc = node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14498,8 +14374,8 @@ skip: } /* Move the cursor to the previous data item. 
*/ -static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node; int rc; @@ -14508,7 +14384,7 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); if (unlikely(rc)) return rc; mc->mc_ki[mc->mc_top]++; @@ -14520,8 +14396,7 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { if (op == MDBX_PREV || op == MDBX_PREV_DUP) { - rc = - mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); + rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { if (likely(rc == MDBX_SUCCESS)) { get_key_optional(node, key); @@ -14537,8 +14412,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } } - mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); + DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); mc->mc_flags &= ~(C_EOF | C_DEL); @@ -14546,20 +14421,19 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)--ki; if (unlikely(ki < 0)) { mc->mc_ki[mc->mc_top] = 0; - mdbx_debug("%s", "=====> move to prev sibling page"); - if ((rc = mdbx_cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) + DEBUG("%s", "=====> move to prev sibling page"); + if ((rc = cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) return rc; mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + 
mc->mc_ki[mc->mc_top]); } - mdbx_debug("==> cursor points to page %" PRIaPGNO - " with %u keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } @@ -14574,14 +14448,14 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mp); + rc = node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14591,10 +14465,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } /* Set the cursor on a specific data item. 
*/ -__hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, - MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op) { +__hot static struct cursor_set_result +cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node = NULL; DKBUF_DEBUG; @@ -14603,7 +14475,7 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, ret.exact = false; if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || key->iov_len > mc->mc_dbx->md_klen_max)) { - mdbx_cassert(mc, !"Invalid key-size"); + cASSERT(mc, !"Invalid key-size"); ret.err = MDBX_BAD_VALSIZE; return ret; } @@ -14613,7 +14485,7 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { switch (aligned_key.iov_len) { default: - mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); ret.err = MDBX_BAD_VALSIZE; return ret; case 4: @@ -14638,7 +14510,7 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, if (mc->mc_flags & C_INITIALIZED) { MDBX_val nodekey; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mp = mc->mc_pg[mc->mc_top]; if (unlikely(!page_numkeys(mp))) { mc->mc_ki[mc->mc_top] = 0; @@ -14659,9 +14531,8 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, * was the one we wanted. 
*/ mc->mc_ki[mc->mc_top] = 0; ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } if (cmp > 0) { @@ -14676,12 +14547,12 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); if (cmp == 0) { /* last node was the one we wanted */ - mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); + cASSERT(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } if (cmp < 0) { @@ -14698,9 +14569,9 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, if (cmp == 0) { /* current node was the one we wanted */ ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } } @@ -14716,7 +14587,7 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, break; if (i == mc->mc_top) { /* There are no other pages */ - mdbx_cassert(mc, nkeys <= UINT16_MAX); + cASSERT(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; mc->mc_flags |= C_EOF; ret.err = MDBX_NOTFOUND; @@ -14729,9 +14600,8 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, if (op == MDBX_SET_RANGE) goto got_node; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); 
ret.err = MDBX_NOTFOUND; return ret; } @@ -14739,15 +14609,15 @@ __hot static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, mc->mc_pg[0] = 0; } - ret.err = mdbx_page_search(mc, &aligned_key, 0); + ret.err = page_search(mc, &aligned_key, 0); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mp)); + cASSERT(mc, IS_LEAF(mp)); search_node:; - struct node_result nsr = mdbx_node_search(mc, &aligned_key); + struct node_result nsr = node_search(mc, &aligned_key); node = nsr.node; ret.exact = nsr.exact; if (!ret.exact) { @@ -14761,29 +14631,28 @@ search_node:; } if (node == NULL) { - mdbx_debug("%s", "===> inexact leaf not found, goto sibling"); - ret.err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + DEBUG("%s", "===> inexact leaf not found, goto sibling"); + ret.err = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(ret.err != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return ret; /* no entries matched */ } mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mp)); + cASSERT(mc, IS_LEAF(mp)); if (!IS_LEAF2(mp)) node = page_node(mp, 0); } } - mdbx_cassert(mc, - mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); got_node: mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); ret.err = MDBX_CORRUPTED; return ret; } @@ -14798,16 +14667,15 @@ got_node: } if (F_ISSET(node_flags(node), F_DUPDATA)) { - ret.err = mdbx_xcursor_init1(mc, node, mp); + ret.err = cursor_xinit1(mc, node, mp); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { 
- ret.err = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } else { - ret = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE); + ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_GET_BOTH && !ret.exact) { @@ -14819,7 +14687,7 @@ got_node: if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { - mdbx_cassert(mc, !"Invalid data-size"); + cASSERT(mc, !"Invalid data-size"); ret.err = MDBX_BAD_VALSIZE; return ret; } @@ -14828,7 +14696,7 @@ got_node: if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { switch (aligned_data.iov_len) { default: - mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERDUP"); + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP"); ret.err = MDBX_BAD_VALSIZE; return ret; case 4: @@ -14846,14 +14714,14 @@ got_node: } } MDBX_val actual_data; - ret.err = mdbx_node_read(mc, node, &actual_data, mc->mc_pg[mc->mc_top]); + ret.err = node_read(mc, node, &actual_data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data); if (cmp) { - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); if (op != MDBX_GET_BOTH_RANGE || cmp > 0) { ret.err = MDBX_NOTFOUND; return ret; @@ -14861,7 +14729,7 @@ got_node: } *data = actual_data; } else { - ret.err = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]); + ret.err = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } @@ -14871,29 +14739,29 @@ got_node: if (op == MDBX_SET_RANGE || op 
== MDBX_SET_KEY) get_key_optional(node, key); - mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), - DVAL_DEBUG(data)); + DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), + DVAL_DEBUG(data)); ret.err = MDBX_SUCCESS; return ret; } /* Move the cursor to the first item in the database. */ -static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + rc = page_search(mc, NULL, MDBX_PS_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; } const MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } @@ -14911,14 +14779,14 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { MDBX_node *node = page_node(mp, 0); if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mp); + rc = node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14928,22 +14796,22 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { } /* Move the cursor to the last item in the database. 
*/ -static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + rc = page_search(mc, NULL, MDBX_PS_LAST); if (unlikely(rc != MDBX_SUCCESS)) return rc; } const MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } @@ -14960,14 +14828,14 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - rc = mdbx_node_read(mc, node, data, mp); + rc = node_read(mc, node, data, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14996,20 +14864,20 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_ENODATA; const MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } const unsigned nkeys = page_numkeys(mp); if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { - 
mdbx_cassert(mc, nkeys <= UINT16_MAX); + cASSERT(mc, nkeys <= UINT16_MAX); if (mc->mc_flags & C_EOF) return MDBX_ENODATA; mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } - mdbx_cassert(mc, nkeys > 0); + cASSERT(mc, nkeys > 0); rc = MDBX_SUCCESS; if (IS_LEAF2(mp)) { @@ -15021,10 +14889,10 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (data) { if (F_ISSET(node_flags(node), F_DUPDATA)) { if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else { @@ -15034,7 +14902,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } } else { - rc = mdbx_node_read(mc, node, data, mp); + rc = node_read(mc, node, data, mp); if (unlikely(rc)) return rc; } @@ -15055,12 +14923,11 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_SET_RANGE: if (unlikely(key == NULL)) return MDBX_EINVAL; - rc = mdbx_cursor_set(mc, key, data, op).err; + rc = cursor_set(mc, key, data, op).err; if (mc->mc_flags & C_INITIALIZED) { - mdbx_cassert(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); } break; case MDBX_GET_MULTIPLE: @@ -15078,7 +14945,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_DUP); + rc = cursor_next(mc, key, 
data, MDBX_NEXT_DUP); if (rc == MDBX_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDBX_cursor *mx; @@ -15100,11 +14967,11 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_INCOMPATIBLE; rc = MDBX_SUCCESS; if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); if (rc == MDBX_SUCCESS) { MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; if (mx->mc_flags & C_INITIALIZED) { - rc = mdbx_cursor_sibling(mx, SIBLING_LEFT); + rc = cursor_sibling(mx, SIBLING_LEFT); if (rc == MDBX_SUCCESS) goto fetchm; } else { @@ -15115,18 +14982,18 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_NEXT: case MDBX_NEXT_DUP: case MDBX_NEXT_NODUP: - rc = mdbx_cursor_next(mc, key, data, op); + rc = cursor_next(mc, key, data, op); break; case MDBX_PREV: case MDBX_PREV_DUP: case MDBX_PREV_NODUP: - rc = mdbx_cursor_prev(mc, key, data, op); + rc = cursor_prev(mc, key, data, op); break; case MDBX_FIRST: - rc = mdbx_cursor_first(mc, key, data); + rc = cursor_first(mc, key, data); break; case MDBX_FIRST_DUP: - mfunc = mdbx_cursor_first; + mfunc = cursor_first; move: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; @@ -15141,7 +15008,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(node_flags(node), F_DUPDATA)) { get_key_optional(node, key); - rc = mdbx_node_read(mc, node, data, mc->mc_pg[mc->mc_top]); + rc = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); break; } } @@ -15150,18 +15017,17 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); break; case MDBX_LAST: - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); break; case MDBX_LAST_DUP: - mfunc = mdbx_cursor_last; + mfunc = cursor_last; goto move; 
case MDBX_SET_UPPERBOUND: /* mostly same as MDBX_SET_LOWERBOUND */ case MDBX_SET_LOWERBOUND: { if (unlikely(key == NULL || data == NULL)) return MDBX_EINVAL; MDBX_val save_data = *data; - struct cursor_set_result csr = - mdbx_cursor_set(mc, key, data, MDBX_SET_RANGE); + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); rc = csr.err; if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) { mc->mc_flags &= ~C_DEL; @@ -15172,18 +15038,18 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, * returning MDBX_BAD_VALSIZE. */ } else if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { *data = save_data; - csr = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE); + csr = + cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); rc = csr.err; if (rc == MDBX_NOTFOUND) { - mdbx_cassert(mc, !csr.exact); - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + cASSERT(mc, !csr.exact); + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); } } else { int cmp = mc->mc_dbx->md_dcmp(&save_data, data); csr.exact = (cmp == 0); if (cmp > 0) - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); } } if (rc == MDBX_SUCCESS && !csr.exact) @@ -15195,12 +15061,12 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = MDBX_SUCCESS; else if (rc == MDBX_SUCCESS) /* exactly match, going next */ - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT); + rc = cursor_next(mc, key, data, MDBX_NEXT); } break; } default: - mdbx_debug("unhandled/unimplemented cursor operation %u", op); + DEBUG("unhandled/unimplemented cursor operation %u", op); return MDBX_EINVAL; } @@ -15210,11 +15076,11 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, static int cursor_first_batch(MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - int err = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + int err = 
page_search(mc, NULL, MDBX_PS_FIRST); if (unlikely(err != MDBX_SUCCESS)) return err; } - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -15237,19 +15103,19 @@ static int cursor_next_batch(MDBX_cursor *mc) { mc->mc_ki[mc->mc_top] = (indx_t)++ki; const int numkeys = page_numkeys(mp); if (likely(ki >= numkeys)) { - mdbx_debug("%s", "=====> move to next sibling page"); + DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - int err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + int err = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(err != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return err; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } } @@ -15283,7 +15149,7 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, rc = likely(mc->mc_flags & C_INITIALIZED) ? 
MDBX_SUCCESS : MDBX_ENODATA; break; default: - mdbx_debug("unhandled/unimplemented cursor operation %u", op); + DEBUG("unhandled/unimplemented cursor operation %u", op); rc = EINVAL; break; } @@ -15295,18 +15161,18 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } const unsigned nkeys = page_numkeys(mp); unsigned i = mc->mc_ki[mc->mc_top], n = 0; if (unlikely(i >= nkeys)) { - mdbx_cassert(mc, op == MDBX_GET_CURRENT); - mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + cASSERT(mc, op == MDBX_GET_CURRENT); + cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); *count = 0; if (mc->mc_flags & C_EOF) { - mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); return MDBX_ENODATA; } if (mdbx_cursor_on_last(mc) != MDBX_RESULT_TRUE) @@ -15322,7 +15188,7 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, } const MDBX_node *leaf = page_node(mp, i); get_key(leaf, &pairs[n]); - rc = mdbx_node_read(mc, leaf, &pairs[n + 1], mp); + rc = node_read(mc, leaf, &pairs[n + 1], mp); if (unlikely(rc != MDBX_SUCCESS)) break; n += 2; @@ -15333,19 +15199,19 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return rc; } -static int mdbx_touch_dbi(MDBX_cursor *mc) { - mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); +static int touch_dbi(MDBX_cursor *mc) { + cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); *mc->mc_dbistate |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { - mdbx_cassert(mc, (mc->mc_flags & C_RECLAIMING) == 0); + 
cASSERT(mc, (mc->mc_flags & C_RECLAIMING) == 0); /* Touch DB record of named DB */ MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); + int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; - rc = mdbx_page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); + rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -15355,17 +15221,17 @@ static int mdbx_touch_dbi(MDBX_cursor *mc) { /* Touch all the pages in the cursor stack. Set mc_top. * Makes sure all the pages are writable, before attempting a write operation. * [in] mc The cursor to operate on. */ -static int mdbx_cursor_touch(MDBX_cursor *mc) { +static int cursor_touch(MDBX_cursor *mc) { int rc = MDBX_SUCCESS; if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - rc = mdbx_touch_dbi(mc); + rc = touch_dbi(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; } if (likely(mc->mc_snum)) { mc->mc_top = 0; do { - rc = mdbx_page_touch(mc); + rc = page_touch(mc); } while (!rc && ++(mc->mc_top) < mc->mc_snum); mc->mc_top = mc->mc_snum - 1; } @@ -15395,7 +15261,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) return MDBX_BAD_DBI; - mdbx_cassert(mc, cursor_is_tracked(mc)); + cASSERT(mc, cursor_is_tracked(mc)); env = mc->mc_txn->mt_env; /* Check this first so counter will always be zero on any early failures. 
*/ @@ -15438,19 +15304,19 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (likely((mc->mc_flags & C_SUB) == 0)) { if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || key->iov_len > mc->mc_dbx->md_klen_max)) { - mdbx_cassert(mc, !"Invalid key-size"); + cASSERT(mc, !"Invalid key-size"); return MDBX_BAD_VALSIZE; } if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { - mdbx_cassert(mc, !"Invalid data-size"); + cASSERT(mc, !"Invalid data-size"); return MDBX_BAD_VALSIZE; } if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { switch (key->iov_len) { default: - mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); return MDBX_BAD_VALSIZE; case 4: if (unlikely(3 & (uintptr_t)key->iov_base)) { @@ -15473,7 +15339,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { switch (data->iov_len) { default: - mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERKEY"); + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); return MDBX_BAD_VALSIZE; case 4: if (unlikely(3 & (uintptr_t)data->iov_base)) { @@ -15499,10 +15365,9 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } } - mdbx_debug( - "==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, - DDBI(mc), DKEY_DEBUG(key), key->iov_len, - DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); + DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, + DDBI(mc), DKEY_DEBUG(key), key->iov_len, + DVAL_DEBUG((flags & MDBX_RESERVE) ? 
nullptr : data), data->iov_len); int dupdata_flag = 0; if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) { @@ -15526,9 +15391,8 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, - mc->mc_xcursor != NULL && - (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); + cASSERT(mc, mc->mc_xcursor != NULL && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); /* Если за ключом более одного значения, либо если размер данных * отличается, то вместо обновления требуется удаление и * последующая вставка. */ @@ -15564,7 +15428,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else if ((flags & MDBX_CURRENT) == 0) { bool exact = false; if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { - rc = mdbx_cursor_last(mc, &dkey, &olddata); + rc = cursor_last(mc, &dkey, &olddata); if (likely(rc == MDBX_SUCCESS)) { rc = mc->mc_dbx->md_cmp(key, &dkey); if (likely(rc > 0)) { @@ -15581,25 +15445,25 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { struct cursor_set_result csr = /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ - mdbx_cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET); + cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET); rc = csr.err; exact = csr.exact; } if (likely(rc == MDBX_SUCCESS)) { if (exact) { if (unlikely(flags & MDBX_NOOVERWRITE)) { - mdbx_debug("duplicate key [%s]", DKEY_DEBUG(key)); + DEBUG("duplicate key [%s]", DKEY_DEBUG(key)); *data = olddata; return MDBX_KEYEXIST; } if (unlikely(mc->mc_flags & C_SUB)) { /* nested subtree of DUPSORT-database with the same key, * nothing to update */ - mdbx_assert(env, data->iov_len == 0 && - (olddata.iov_len == 0 || - /* olddata may not be updated in case LEAF2-page - of dupfixed-subDB 
*/ - (mc->mc_db->md_flags & MDBX_DUPFIXED))); + eASSERT(env, data->iov_len == 0 && + (olddata.iov_len == 0 || + /* olddata may not be updated in case LEAF2-page + of dupfixed-subDB */ + (mc->mc_db->md_flags & MDBX_DUPFIXED))); return MDBX_SUCCESS; } if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor && @@ -15643,22 +15507,22 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, rdata = &xdata; xdata.iov_len = data->iov_len * dcount; } - if (unlikely(err = mdbx_cursor_spill(mc, key, rdata))) + if (unlikely(err = cursor_spill(mc, key, rdata))) return err; } if (unlikely(rc == MDBX_NO_ROOT)) { /* new database, write a root leaf page */ - mdbx_debug("%s", "allocating new root leaf page"); + DEBUG("%s", "allocating new root leaf page"); if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - err = mdbx_touch_dbi(mc); + err = touch_dbi(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } pgr_t npr = page_new(mc, P_LEAF); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; - npr.err = mdbx_cursor_push(mc, npr.page); + npr.err = cursor_push(mc, npr.page); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; mc->mc_db->md_root = npr.page->mp_pgno; @@ -15683,7 +15547,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mc->mc_flags |= C_INITIALIZED; } else { /* make sure all cursor pages are writable */ - err = mdbx_cursor_touch(mc); + err = cursor_touch(mc); if (unlikely(err)) return err; } @@ -15695,7 +15559,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, fp->mp_txnid = mc->mc_txn->mt_front; if (insert_key) { /* The key does not exist */ - mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); + DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]); if ((mc->mc_db->md_flags & MDBX_DUPSORT) && node_size(key, data) > env->me_leaf_nodemax) { /* Too big for a node, insert in sub-DB. 
Set up an empty @@ -15728,15 +15592,15 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } err = MDBX_SUCCESS; if (mc->mc_ki[mc->mc_top]) - err = mdbx_update_key(mc, key); - mdbx_cassert(mc, mc->mc_top + dtop < UINT16_MAX); + err = update_key(mc, key); + cASSERT(mc, mc->mc_top + dtop < UINT16_MAX); mc->mc_top += (uint8_t)dtop; if (unlikely(err != MDBX_SUCCESS)) return err; } - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -15744,8 +15608,8 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } more:; - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -15761,7 +15625,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(lp.err != MDBX_SUCCESS)) return lp.err; - mdbx_cassert(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); /* Is the ov page from this txn (or a parent) and big enough? 
*/ int ovpages = lp.page->mp_pages; @@ -15776,35 +15640,34 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (IS_SPILLED(mc->mc_txn, lp.page)) { lp = /* TODO: avoid search and get txn & spill-index from page_result */ - mdbx_page_unspill(mc->mc_txn, lp.page); + page_unspill(mc->mc_txn, lp.page); if (unlikely(lp.err)) return lp.err; } else { if (unlikely(!mc->mc_txn->mt_parent)) { - mdbx_error( - "Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - "overflow/large", pgno, lp.page->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + "overflow/large", pgno, lp.page->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); return MDBX_PROBLEM; } /* It is writable only in a parent txn */ - MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); + MDBX_page *np = page_malloc(mc->mc_txn, ovpages); if (unlikely(!np)) return MDBX_ENOMEM; memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */ - err = mdbx_page_dirty(mc->mc_txn, lp.page = np, ovpages); + err = page_dirty(mc->mc_txn, lp.page = np, ovpages); if (unlikely(err != MDBX_SUCCESS)) return err; #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone.weak += ovpages; #endif /* MDBX_ENABLE_PGOP_STAT */ - mdbx_cassert(mc, mdbx_dirtylist_check(mc->mc_txn)); + cASSERT(mc, dirtylist_check(mc->mc_txn)); } } node_set_ds(node, data->iov_len); @@ -15813,21 +15676,21 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, else memcpy(page_data(lp.page), data->iov_base, data->iov_len); - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != 
MDBX_SUCCESS)) return err; } return MDBX_SUCCESS; } - if ((err = mdbx_page_retire(mc, lp.page)) != MDBX_SUCCESS) + if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS) return err; } else { olddata.iov_len = node_ds(node); olddata.iov_base = node_data(node); - mdbx_cassert(mc, (char *)olddata.iov_base + olddata.iov_len <= - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, (char *)olddata.iov_base + olddata.iov_len <= + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); /* DB has dups? */ if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { @@ -15862,13 +15725,13 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, * considers them equal. So continue update since called without. * Continue to update since was called without MDBX_NODUPDATA. */ } - mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); + cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); goto current; } /* Just overwrite the current item */ if (flags & MDBX_CURRENT) { - mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); + cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); goto current; } @@ -15885,11 +15748,11 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, fp->mp_flags |= P_LEAF2; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ - mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + cASSERT(mc, xdata.iov_len <= env->me_psize); } else { xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + (dkey.iov_len & 1) + (data->iov_len & 1); - mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + cASSERT(mc, xdata.iov_len <= env->me_psize); } fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ @@ -15948,7 +15811,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(par.err != MDBX_SUCCESS)) return par.err; mc->mc_db->md_leaf_pages += 1; - 
mdbx_cassert(mc, env->me_psize > olddata.iov_len); + cASSERT(mc, env->me_psize > olddata.iov_len); offset = env->me_psize - (unsigned)olddata.iov_len; flags |= F_DUPDATA | F_SUBDATA; nested_dupdb.md_root = mp->mp_pgno; @@ -15961,7 +15824,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mp->mp_txnid = mc->mc_txn->mt_front; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; - mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); + cASSERT(mc, fp->mp_upper + offset <= UINT16_MAX); mp->mp_upper = (indx_t)(fp->mp_upper + offset); if (unlikely(fp_flags & P_LEAF2)) { memcpy(page_data(mp), page_data(fp), @@ -15973,7 +15836,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); for (i = 0; i < page_numkeys(fp); i++) { - mdbx_cassert(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); + cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); mp->mp_ptrs[i] += (indx_t)offset; } } @@ -15993,7 +15856,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, current: if (data->iov_len == olddata.iov_len) { - mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); + cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. 
*/ @@ -16002,20 +15865,20 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, else if (!(mc->mc_flags & C_SUB)) memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { - mdbx_cassert(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); - mdbx_cassert(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); - mdbx_cassert(mc, node_ds(node) == 0); - mdbx_cassert(mc, node_flags(node) == 0); - mdbx_cassert(mc, key->iov_len < UINT16_MAX); + cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); + cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); + cASSERT(mc, node_ds(node) == 0); + cASSERT(mc, node_flags(node) == 0); + cASSERT(mc, key->iov_len < UINT16_MAX); node_set_ks(node, key->iov_len); memcpy(node_key(node), key->iov_base, key->iov_len); - mdbx_cassert(mc, (char *)node_key(node) + node_ds(node) < - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, (char *)node_key(node) + node_ds(node) < + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; } - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -16034,13 +15897,13 @@ new_sub:; if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { rc = page_split(mc, key, rdata, P_INVALID, insert_key ? naf : naf | MDBX_SPLIT_REPLACE); - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = insert_key ? mdbx_cursor_check(mc) : mdbx_cursor_check_updating(mc); + if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) + rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); } else { /* There is room already in this leaf page. 
*/ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - mdbx_cassert(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && - rdata->iov_len == 0); + cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && + rdata->iov_len == 0); rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); } else rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); @@ -16084,7 +15947,7 @@ new_sub:; SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); if ((flags & MDBX_CURRENT) == 0) { xflags -= MDBX_CURRENT; - err = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -16113,7 +15976,7 @@ new_sub:; continue; if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { - err = mdbx_xcursor_init2(m2, mx, dupdata_flag); + err = cursor_xinit2(m2, mx, dupdata_flag); if (unlikely(err != MDBX_SUCCESS)) return err; } else if (!insert_key && m2->mc_ki[i] < nkeys) { @@ -16122,7 +15985,7 @@ new_sub:; } } } - mdbx_cassert(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); + cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == @@ -16160,13 +16023,13 @@ new_sub:; } } } - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc); + if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) + rc = cursor_check(mc); return rc; bad_sub: if (unlikely(rc == MDBX_KEYEXIST)) { /* should not happen, we deleted that item */ - mdbx_error("Unexpected %i error while put to nested dupsort's hive", rc); + ERROR("Unexpected %i error while put to nested dupsort's hive", rc); rc = MDBX_PROBLEM; } } @@ -16196,17 +16059,17 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return MDBX_NOTFOUND; if (likely((flags & MDBX_NOSPILL) == 0) && - unlikely(rc = mdbx_cursor_spill(mc, NULL, NULL))) + unlikely(rc = cursor_spill(mc, NULL, 
NULL))) return rc; - rc = mdbx_cursor_touch(mc); + rc = cursor_touch(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - mdbx_error("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } if (IS_LEAF2(mp)) @@ -16215,7 +16078,7 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { - /* mdbx_cursor_del0() will subtract the final entry */ + /* cursor_del() will subtract the final entry */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { @@ -16234,7 +16097,7 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } else { MDBX_cursor *m2; /* shrink fake page */ - mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]); + node_shrink(mp, mc->mc_ki[mc->mc_top]); node = page_node(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ @@ -16257,8 +16120,8 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } } mc->mc_db->md_entries--; - mdbx_cassert(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && - mc->mc_db->md_root != P_INVALID); + cASSERT(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && + mc->mc_db->md_root != P_INVALID); return rc; } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; @@ -16268,7 +16131,7 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (node_flags(node) & F_SUBDATA) { /* add all the child DB's pages to the free list */ - rc = 
mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); if (unlikely(rc)) goto fail; } @@ -16280,12 +16143,12 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { /* add large/overflow pages to free list */ if (F_ISSET(node_flags(node), F_BIGDATA)) { pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); - if (unlikely((rc = lp.err) || (rc = mdbx_page_retire(mc, lp.page)))) + if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page)))) goto fail; } del_key: - return mdbx_cursor_del0(mc); + return cursor_del(mc); fail: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; @@ -16295,17 +16158,16 @@ fail: /* Allocate and initialize new pages for a database. * Set MDBX_TXN_ERROR on failure. */ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { - mdbx_cassert(mc, (flags & P_OVERFLOW) == 0); + cASSERT(mc, (flags & P_OVERFLOW) == 0); pgr_t ret = page_alloc(mc); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - mdbx_debug("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, - ret.page->mp_pgno); + DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; ret.page->mp_txnid = mc->mc_txn->mt_front; - mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); - mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -16318,7 +16180,7 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { mc->mc_db->md_branch_pages += is_branch; mc->mc_db->md_leaf_pages += 1 - is_branch; if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); + MDBX_db *outer = outer_db(mc); outer->md_branch_pages += is_branch; outer->md_leaf_pages += 1 - is_branch; } @@ -16332,19 +16194,19 @@ static pgr_t 
page_new_large(MDBX_cursor *mc, const unsigned npages) { if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - mdbx_debug("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, - ret.page->mp_pgno, npages); + DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, + ret.page->mp_pgno, npages); ret.page->mp_flags = P_OVERFLOW; ret.page->mp_txnid = mc->mc_txn->mt_front; - mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); - mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ mc->mc_db->md_overflow_pages += npages; ret.page->mp_pages = npages; - mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); + cASSERT(mc, !(mc->mc_flags & C_SUB)); return ret; } @@ -16353,15 +16215,15 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to leaf2-%spage %" PRIaPGNO " index %i, " - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, - key ? key->iov_len : 0, DKEY_DEBUG(key)); + DEBUG("add to leaf2-%spage %" PRIaPGNO " index %i, " + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, key ? 
key->iov_len : 0, + DKEY_DEBUG(key)); - mdbx_cassert(mc, key); - mdbx_cassert(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); + cASSERT(mc, key); + cASSERT(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); const unsigned ksize = mc->mc_db->md_xsize; - mdbx_cassert(mc, ksize == key->iov_len); + cASSERT(mc, ksize == key->iov_len); const unsigned nkeys = page_numkeys(mp); /* Just using these for counting */ @@ -16375,7 +16237,7 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, mp->mp_upper = (indx_t)upper; char *const ptr = page_leaf2key(mp, indx, ksize); - mdbx_cassert(mc, nkeys >= indx); + cASSERT(mc, nkeys >= indx); const unsigned diff = nkeys - indx; if (likely(diff > 0)) /* Move higher keys up one slot. */ @@ -16390,17 +16252,17 @@ static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, pgno_t pgno) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, - key ? key->iov_len : 0, DKEY_DEBUG(key)); + DEBUG("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, + key ? key->iov_len : 0, DKEY_DEBUG(key)); - mdbx_cassert(mc, PAGETYPE_WHOLE(mp) == P_BRANCH); + cASSERT(mc, PAGETYPE_WHOLE(mp) == P_BRANCH); STATIC_ASSERT(NODESIZE % 2 == 0); /* Move higher pointers up one slot. */ const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, nkeys >= indx); + cASSERT(mc, nkeys >= indx); for (unsigned i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; @@ -16435,13 +16297,13 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, unsigned flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? 
"sub-" : "", mp->mp_pgno, indx, - data ? data->iov_len : 0, key ? key->iov_len : 0, DKEY_DEBUG(key)); - mdbx_cassert(mc, key != NULL && data != NULL); - mdbx_cassert(mc, PAGETYPE_COMPAT(mp) == P_LEAF); - mdbx_cassert(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); + DEBUG("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0, + key ? key->iov_len : 0, DKEY_DEBUG(key)); + cASSERT(mc, key != NULL && data != NULL); + cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF); + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); MDBX_page *largepage = NULL; size_t node_bytes; @@ -16454,13 +16316,13 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, mc->mc_txn->mt_env->me_leaf_nodemax)) { /* Put data on large/overflow page. */ if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) { - mdbx_error("Unexpected target %s flags 0x%x for large data-item", - "dupsort-db", mc->mc_db->md_flags); + ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db", + mc->mc_db->md_flags); return MDBX_PROBLEM; } if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) { - mdbx_error("Unexpected target %s flags 0x%x for large data-item", "node", - flags); + ERROR("Unexpected target %s flags 0x%x for large data-item", "node", + flags); return MDBX_PROBLEM; } const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); @@ -16468,20 +16330,20 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; largepage = npr.page; - mdbx_debug("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR - " data bytes", - largepage->mp_pages, largepage->mp_pgno, data->iov_len); + DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR + " data bytes", + largepage->mp_pages, largepage->mp_pgno, data->iov_len); flags |= F_BIGDATA; node_bytes = 
node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); } else { node_bytes = node_size(key, data) + sizeof(indx_t); } - mdbx_cassert(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); /* Move higher pointers up one slot. */ const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, nkeys >= indx); + cASSERT(mc, nkeys >= indx); for (unsigned i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; @@ -16530,26 +16392,25 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { const unsigned hole = mc->mc_ki[mc->mc_top]; const unsigned nkeys = page_numkeys(mp); - mdbx_debug("delete node %u on %s page %" PRIaPGNO, hole, - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); - mdbx_cassert(mc, hole < nkeys); + DEBUG("delete node %u on %s page %" PRIaPGNO, hole, + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); + cASSERT(mc, hole < nkeys); if (IS_LEAF2(mp)) { - mdbx_cassert(mc, ksize >= sizeof(indx_t)); + cASSERT(mc, ksize >= sizeof(indx_t)); unsigned diff = nkeys - 1 - hole; char *base = page_leaf2key(mp, hole, ksize); if (diff) memmove(base, base + ksize, diff * ksize); - mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); + cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mdbx_cassert(mc, - (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); return; } MDBX_node *node = page_node(mp, hole); - mdbx_cassert(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0); + cASSERT(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0); size_t hole_size = NODESIZE + node_ks(node); if (IS_LEAF(mp)) hole_size += @@ -16567,24 +16428,24 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { char *base = (char *)mp + mp->mp_upper + PAGEHDRSZ; memmove(base + hole_size, base, hole_offset - mp->mp_upper); - mdbx_cassert(mc, 
mp->mp_lower >= sizeof(indx_t)); + cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size); + cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size); mp->mp_upper += (indx_t)hole_size; - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { const uint8_t checking = mc->mc_checking; mc->mc_checking |= CC_UPDATING; - const int page_check_err = mdbx_page_check(mc, mp); + const int page_check_err = page_check(mc, mp); mc->mc_checking = checking; - mdbx_cassert(mc, page_check_err == MDBX_SUCCESS); + cASSERT(mc, page_check_err == MDBX_SUCCESS); } } /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. */ -static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { +static void node_shrink(MDBX_page *mp, unsigned indx) { MDBX_node *node; MDBX_page *sp, *xp; char *base; @@ -16642,11 +16503,11 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { * depend only on the parent DB. * * [in] mc The main cursor whose sorted-dups cursor is to be initialized. 
*/ -static int mdbx_xcursor_init0(MDBX_cursor *mc) { +static int cursor_xinit0(MDBX_cursor *mc) { MDBX_xcursor *mx = mc->mc_xcursor; if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } @@ -16661,7 +16522,7 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_LEAF2); - mdbx_cassert(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF); + cASSERT(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF); mx->mx_cursor.mc_checking = mc->mc_checking + ((mc->mc_db->md_flags & MDBX_DUPFIXED) << 1); mx->mx_dbx.md_name.iov_len = 0; @@ -16678,33 +16539,32 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { * [in] mc The main cursor whose sorted-dups cursor is to be initialized. * [in] node The data containing the MDBX_db record for the sorted-dup database. 
*/ -static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, - const MDBX_page *mp) { +static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, + const MDBX_page *mp) { MDBX_xcursor *mx = mc->mc_xcursor; if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } const uint8_t flags = node_flags(node); switch (flags) { default: - mdbx_error("invalid node flags %u", flags); + ERROR("invalid node flags %u", flags); return MDBX_CORRUPTED; case F_DUPDATA | F_SUBDATA: if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { - mdbx_error("invalid nested-db record size %zu", node_ds(node)); + ERROR("invalid nested-db record size %zu", node_ds(node)); return MDBX_CORRUPTED; } memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); const txnid_t pp_txnid = mp->mp_txnid; if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) { - mdbx_error("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN - ")", - mx->mx_db.md_mod_txnid, pp_txnid); + ERROR("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + mx->mx_db.md_mod_txnid, pp_txnid); return MDBX_CORRUPTED; } mx->mx_cursor.mc_pg[0] = 0; @@ -16714,7 +16574,7 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, break; case F_DUPDATA: if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) { - mdbx_error("invalid nested-page size %zu", node_ds(node)); + ERROR("invalid nested-page size %zu", node_ds(node)); return MDBX_CORRUPTED; } MDBX_page *fp = node_data(node); @@ -16738,22 +16598,21 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) { if (!MDBX_DISABLE_VALIDATION && unlikely(mc->mc_db->md_xsize != 0)) { - mdbx_error("cursor mismatched nested-db md_xsize %u", 
- mc->mc_db->md_xsize); + ERROR("cursor mismatched nested-db md_xsize %u", mc->mc_db->md_xsize); return MDBX_CORRUPTED; } if (!MDBX_DISABLE_VALIDATION && unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) { - mdbx_error("mismatched nested-db md_flags %u", mc->mc_db->md_flags); + ERROR("mismatched nested-db md_flags %u", mc->mc_db->md_flags); return MDBX_CORRUPTED; } if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min || mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) { - mdbx_error("mismatched nested-db.md_xsize (%u) <> min/max value-length " - "(%zu/%zu)", - mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); + ERROR("mismatched nested-db.md_xsize (%u) <> min/max value-length " + "(%zu/%zu)", + mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); return MDBX_CORRUPTED; } mc->mc_db->md_xsize = mx->mx_db.md_xsize; @@ -16762,8 +16621,8 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min; mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max; - mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); + DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); return MDBX_SUCCESS; } @@ -16774,12 +16633,12 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. * [in] src_mx The xcursor of an up-to-date cursor. * [in] new_dupdata True if converting from a non-F_DUPDATA item. 
*/ -static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, - bool new_dupdata) { +static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, + bool new_dupdata) { MDBX_xcursor *mx = mc->mc_xcursor; if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } @@ -16796,16 +16655,15 @@ static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; if (mx->mx_cursor.mc_flags & C_INITIALIZED) { - mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); + DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); } return MDBX_SUCCESS; } -static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, - const MDBX_dbi dbi, MDBX_txn *const txn, - MDBX_db *const db, MDBX_dbx *const dbx, - uint8_t *const dbstate) { +static __inline int couple_init(MDBX_cursor_couple *couple, const MDBX_dbi dbi, + MDBX_txn *const txn, MDBX_db *const db, + MDBX_dbx *const dbx, uint8_t *const dbstate) { couple->outer.mc_signature = MDBX_MC_LIVE; couple->outer.mc_next = NULL; couple->outer.mc_backup = NULL; @@ -16821,7 +16679,7 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF && CC_OVERFLOW == P_OVERFLOW && CC_LEAF2 == P_LEAF2); couple->outer.mc_checking = - (mdbx_audit_enabled() || (txn->mt_env->me_flags & MDBX_VALIDATION)) + (AUDIT_ENABLED() || (txn->mt_env->me_flags & MDBX_VALIDATION)) ? 
CC_PAGECHECK | CC_LEAF : CC_LEAF; couple->outer.mc_ki[0] = 0; @@ -16829,17 +16687,17 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, int rc = MDBX_SUCCESS; if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { - rc = mdbx_page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); + rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS; } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) { - rc = mdbx_setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, - txn->mt_env->me_psize); + rc = setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, + txn->mt_env->me_psize); } if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE; couple->outer.mc_xcursor = &couple->inner; - rc = mdbx_xcursor_init0(&couple->outer); + rc = cursor_xinit0(&couple->outer); if (unlikely(rc != MDBX_SUCCESS)) return rc; couple->inner.mx_dbx.md_klen_min = couple->outer.mc_dbx->md_vlen_min; @@ -16849,15 +16707,15 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, } /* Initialize a cursor for a given transaction and database. 
*/ -static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { +static int cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); - return mdbx_couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, - &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], - &txn->mt_dbistate[dbi]); + return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, + &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], + &txn->mt_dbistate[dbi]); } MDBX_cursor *mdbx_cursor_create(void *context) { - MDBX_cursor_couple *couple = mdbx_calloc(1, sizeof(MDBX_cursor_couple)); + MDBX_cursor_couple *couple = osal_calloc(1, sizeof(MDBX_cursor_couple)); if (unlikely(!couple)) return nullptr; @@ -16911,7 +16769,7 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { return MDBX_EACCESS; if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { - mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); + cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); if (unlikely(mc->mc_dbi != dbi || /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || mc->mc_txn != txn)) @@ -16931,16 +16789,16 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (mc->mc_signature == MDBX_MC_LIVE) { if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { - mdbx_error("Wrong cursor's transaction %p 0x%x", - __Wpedantic_format_voidptr(mc->mc_txn), - mc->mc_txn ? mc->mc_txn->mt_signature : 0); + ERROR("Wrong cursor's transaction %p 0x%x", + __Wpedantic_format_voidptr(mc->mc_txn), + mc->mc_txn ? 
mc->mc_txn->mt_signature : 0); return MDBX_PROBLEM; } if (mc->mc_flags & C_UNTRACK) { MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - mdbx_cassert(mc, *prev == mc); + cASSERT(mc, *prev == mc); *prev = mc->mc_next; } mc->mc_signature = MDBX_MC_READY4CLOSE; @@ -16951,9 +16809,9 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { mc->mc_dbx = NULL; mc->mc_dbistate = NULL; } - mdbx_cassert(mc, !(mc->mc_flags & C_UNTRACK)); + cASSERT(mc, !(mc->mc_flags & C_UNTRACK)); - rc = mdbx_cursor_init(mc, txn, dbi); + rc = cursor_init(mc, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -17025,27 +16883,27 @@ again: void mdbx_cursor_close(MDBX_cursor *mc) { if (likely(mc)) { - mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_LIVE || - mc->mc_signature == MDBX_MC_READY4CLOSE); + ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE || + mc->mc_signature == MDBX_MC_READY4CLOSE); MDBX_txn *const txn = mc->mc_txn; if (!mc->mc_backup) { mc->mc_txn = NULL; /* Unlink from txn, if tracked. 
*/ if (mc->mc_flags & C_UNTRACK) { - mdbx_ensure(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); + ENSURE(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); MDBX_cursor **prev = &txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - mdbx_tassert(txn, *prev == mc); + tASSERT(txn, *prev == mc); *prev = mc->mc_next; } mc->mc_signature = 0; mc->mc_next = mc; - mdbx_free(mc); + osal_free(mc); } else { /* Cursor closed before nested txn ends */ - mdbx_tassert(txn, mc->mc_signature == MDBX_MC_LIVE); - mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); + tASSERT(txn, mc->mc_signature == MDBX_MC_LIVE); + ENSURE(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); mc->mc_signature = MDBX_MC_WAIT4EOT; } } @@ -17099,8 +16957,8 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { if (mc->mc_xcursor != NULL) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & - C_INITIALIZED)); + cASSERT(mc, mc->mc_xcursor && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) ? PTRDIFF_MAX : (size_t)mc->mc_xcursor->mx_db.md_entries; @@ -17114,7 +16972,7 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { * [in] mc Cursor pointing to the node to operate on. * [in] key The new key to use. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { +static int update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp; MDBX_node *node; char *base; @@ -17123,7 +16981,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { int ptr, i, nkeys, indx; DKBUF_DEBUG; - mdbx_cassert(mc, cursor_is_tracked(mc)); + cASSERT(mc, cursor_is_tracked(mc)); indx = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; node = page_node(mp, indx); @@ -17132,8 +16990,8 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_val k2; k2.iov_base = node_key(node); k2.iov_len = node_ks(node); - mdbx_debug("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, - ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); + DEBUG("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, ptr, + DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); #endif /* MDBX_DEBUG */ /* Sizes must be 2-byte aligned. */ @@ -17145,19 +17003,19 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { if (delta) { if (delta > (int)page_room(mp)) { /* not enough space left, do a delete and split */ - mdbx_debug("Not enough room, delta = %zd, splitting...", delta); + DEBUG("Not enough room, delta = %zd, splitting...", delta); pgno_t pgno = node_pgno(node); node_del(mc, 0); int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); - if (err == MDBX_SUCCESS && mdbx_audit_enabled()) - err = mdbx_cursor_check_updating(mc); + if (err == MDBX_SUCCESS && AUDIT_ENABLED()) + err = cursor_check_updating(mc); return err; } nkeys = page_numkeys(mp); for (i = 0; i < nkeys; i++) { if (mp->mp_ptrs[i] <= ptr) { - mdbx_cassert(mc, mp->mp_ptrs[i] >= delta); + cASSERT(mc, mp->mp_ptrs[i] >= delta); mp->mp_ptrs[i] -= (indx_t)delta; } } @@ -17165,7 +17023,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { base = (char *)mp + mp->mp_upper + PAGEHDRSZ; len = ptr - mp->mp_upper + NODESIZE; memmove(base - delta, base, len); - 
mdbx_cassert(mc, mp->mp_upper >= delta); + cASSERT(mc, mp->mp_upper >= delta); mp->mp_upper -= (indx_t)delta; node = page_node(mp, indx); @@ -17180,19 +17038,19 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { } /* Move a node from csrc to cdst. */ -static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { +static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { int rc; DKBUF_DEBUG; MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_cassert(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); - mdbx_cassert(csrc, csrc->mc_dbi == cdst->mc_dbi); - mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi); + cASSERT(csrc, csrc->mc_top == cdst->mc_top); if (unlikely(PAGETYPE_WHOLE(psrc) != PAGETYPE_WHOLE(pdst))) { bailout: - mdbx_error("Wrong or mismatch pages's types (src %d, dst %d) to move node", - PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst)); + ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node", + PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst)); csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PROBLEM; } @@ -17201,20 +17059,20 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { switch (PAGETYPE_WHOLE(psrc)) { case P_BRANCH: { const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); - mdbx_cassert(csrc, node_flags(srcnode) == 0); + cASSERT(csrc, node_flags(srcnode) == 0); const pgno_t srcpg = node_pgno(srcnode); key4move.iov_len = node_ks(srcnode); key4move.iov_base = node_key(srcnode); if (csrc->mc_ki[csrc->mc_top] == 0) { const unsigned snum = csrc->mc_snum; - mdbx_cassert(csrc, snum > 0); + cASSERT(csrc, snum > 0); /* must find the lowest key below src */ - rc = mdbx_page_search_lowest(csrc); + rc = page_search_lowest(csrc); MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top]; if 
(unlikely(rc)) return rc; - mdbx_cassert(csrc, IS_LEAF(lowest_page)); + cASSERT(csrc, IS_LEAF(lowest_page)); if (unlikely(!IS_LEAF(lowest_page))) goto bailout; if (IS_LEAF2(lowest_page)) { @@ -17232,23 +17090,23 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { csrc->mc_ki[csrc->mc_top] = 0; /* paranoia */ - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(csrc, IS_BRANCH(psrc)); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(csrc, IS_BRANCH(psrc)); if (unlikely(!IS_BRANCH(psrc))) goto bailout; } if (cdst->mc_ki[cdst->mc_top] == 0) { const unsigned snum = cdst->mc_snum; - mdbx_cassert(csrc, snum > 0); + cASSERT(csrc, snum > 0); MDBX_cursor mn; cursor_copy(cdst, &mn); /* must find the lowest key below dst */ - rc = mdbx_page_search_lowest(&mn); + rc = page_search_lowest(&mn); if (unlikely(rc)) return rc; MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top]; - mdbx_cassert(cdst, IS_LEAF(lowest_page)); + cASSERT(cdst, IS_LEAF(lowest_page)); if (unlikely(!IS_LEAF(lowest_page))) goto bailout; MDBX_val key; @@ -17274,13 +17132,12 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (unlikely(needed > have)) return MDBX_RESULT_TRUE; - if (unlikely((rc = mdbx_page_touch(csrc)) || - (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc)) return rc; } else { @@ -17289,24 +17146,23 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (unlikely(needed > have)) return MDBX_RESULT_TRUE; - if (unlikely((rc = mdbx_page_touch(csrc)) || - (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = 
cdst->mc_pg[cdst->mc_top]; } - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ rc = node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); } break; case P_LEAF: { /* Mark src and dst as dirty. */ - if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; @@ -17316,10 +17172,10 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { data.iov_base = node_data(srcnode); key4move.iov_len = node_ks(srcnode); key4move.iov_base = node_key(srcnode); - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ rc = node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, node_flags(srcnode)); @@ -17327,17 +17183,17 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { case P_LEAF | P_LEAF2: { /* Mark src and dst as dirty. 
*/ - if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; key4move.iov_len = csrc->mc_db->md_xsize; key4move.iov_base = page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ rc = node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); } break; @@ -17353,15 +17209,15 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { /* Delete the node from the source page. 
*/ node_del(csrc, key4move.iov_len); - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); - mdbx_cassert(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); { /* Adjust other cursors pointing to mp */ MDBX_cursor *m2, *m3; const MDBX_dbi dbi = csrc->mc_dbi; - mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); + cASSERT(csrc, csrc->mc_top == cdst->mc_top); if (fromleft) { /* If we're adding on the left, bump others up */ for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -17376,7 +17232,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = pdst; m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); m3->mc_ki[csrc->mc_top - 1]++; } if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) @@ -17394,7 +17250,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (!m3->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = pdst; m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); m3->mc_ki[csrc->mc_top - 1]--; } else { m3->mc_ki[csrc->mc_top]--; @@ -17409,7 +17265,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { /* Update the parent separators. 
*/ if (csrc->mc_ki[csrc->mc_top] == 0) { - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); if (csrc->mc_ki[csrc->mc_top - 1] != 0) { MDBX_val key; if (IS_LEAF2(psrc)) { @@ -17420,15 +17276,15 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { key.iov_len = node_ks(srcnode); key.iov_base = node_key(srcnode); } - mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", - psrc->mp_pgno, DKEY_DEBUG(&key)); + DEBUG("update separator for source page %" PRIaPGNO " to [%s]", + psrc->mp_pgno, DKEY_DEBUG(&key)); MDBX_cursor mn; cursor_copy(csrc, &mn); - mdbx_cassert(csrc, mn.mc_snum > 0); + cASSERT(csrc, mn.mc_snum > 0); mn.mc_snum--; mn.mc_top--; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -17436,14 +17292,14 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { const MDBX_val nullkey = {0, 0}; const indx_t ix = csrc->mc_ki[csrc->mc_top]; csrc->mc_ki[csrc->mc_top] = 0; - rc = mdbx_update_key(csrc, &nullkey); + rc = update_key(csrc, &nullkey); csrc->mc_ki[csrc->mc_top] = ix; - mdbx_cassert(csrc, rc == MDBX_SUCCESS); + cASSERT(csrc, rc == MDBX_SUCCESS); } } if (cdst->mc_ki[cdst->mc_top] == 0) { - mdbx_cassert(cdst, cdst->mc_top > 0); + cASSERT(cdst, cdst->mc_top > 0); if (cdst->mc_ki[cdst->mc_top - 1] != 0) { MDBX_val key; if (IS_LEAF2(pdst)) { @@ -17454,15 +17310,15 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { key.iov_len = node_ks(srcnode); key.iov_base = node_key(srcnode); } - mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", - pdst->mp_pgno, DKEY_DEBUG(&key)); + DEBUG("update separator for destination page %" PRIaPGNO " to [%s]", + pdst->mp_pgno, DKEY_DEBUG(&key)); MDBX_cursor 
mn; cursor_copy(cdst, &mn); - mdbx_cassert(cdst, mn.mc_snum > 0); + cASSERT(cdst, mn.mc_snum > 0); mn.mc_snum--; mn.mc_top--; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -17470,9 +17326,9 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { const MDBX_val nullkey = {0, 0}; const indx_t ix = cdst->mc_ki[cdst->mc_top]; cdst->mc_ki[cdst->mc_top] = 0; - rc = mdbx_update_key(cdst, &nullkey); + rc = update_key(cdst, &nullkey); cdst->mc_ki[cdst->mc_top] = ix; - mdbx_cassert(cdst, rc == MDBX_SUCCESS); + cASSERT(cdst, rc == MDBX_SUCCESS); } } @@ -17488,39 +17344,38 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { * [in] cdst Cursor pointing to the destination page. * * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { +static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_val key; int rc; - mdbx_cassert(csrc, csrc != cdst); - mdbx_cassert(csrc, cursor_is_tracked(csrc)); - mdbx_cassert(cdst, cursor_is_tracked(cdst)); + cASSERT(csrc, csrc != cdst); + cASSERT(csrc, cursor_is_tracked(csrc)); + cASSERT(cdst, cursor_is_tracked(cdst)); const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, - pdst->mp_pgno); + DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, + pdst->mp_pgno); - mdbx_cassert(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); - mdbx_cassert(csrc, - csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); - mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ - mdbx_cassert(cdst, cdst->mc_snum > 1); - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(csrc, csrc->mc_snum < csrc->mc_db->md_depth || - IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); - mdbx_cassert(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); + cASSERT(csrc, csrc->mc_snum > 1); /* can't merge root page */ + cASSERT(cdst, cdst->mc_snum > 1); + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || + IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); + cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ const unsigned dst_nkeys = page_numkeys(pdst); const unsigned src_nkeys = page_numkeys(psrc); - mdbx_cassert(cdst, dst_nkeys + 
src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); + cASSERT(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); if (likely(src_nkeys)) { unsigned j = dst_nkeys; if (unlikely(pagetype & P_LEAF2)) { /* Mark dst as dirty. */ - if (unlikely(rc = mdbx_page_touch(cdst))) + if (unlikely(rc = page_touch(cdst))) return rc; key.iov_len = csrc->mc_db->md_xsize; @@ -17540,23 +17395,23 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_cursor mn; cursor_copy(csrc, &mn); /* must find the lowest key below src */ - rc = mdbx_page_search_lowest(&mn); + rc = page_search_lowest(&mn); if (unlikely(rc)) return rc; const MDBX_page *mp = mn.mc_pg[mn.mc_top]; if (likely(!IS_LEAF2(mp))) { - mdbx_cassert(&mn, IS_LEAF(mp)); + cASSERT(&mn, IS_LEAF(mp)); const MDBX_node *lowest = page_node(mp, 0); key.iov_len = node_ks(lowest); key.iov_base = node_key(lowest); } else { - mdbx_cassert(&mn, mn.mc_top > csrc->mc_top); + cASSERT(&mn, mn.mc_top > csrc->mc_top); key.iov_len = mp->mp_leaf2_ksize; key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len); } - mdbx_cassert(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); - mdbx_cassert(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); + cASSERT(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); + cASSERT(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); const size_t dst_room = page_room(pdst); const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc); @@ -17566,7 +17421,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } /* Mark dst as dirty. 
*/ - if (unlikely(rc = mdbx_page_touch(cdst))) + if (unlikely(rc = page_touch(cdst))) return rc; unsigned i = 0; @@ -17577,7 +17432,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { data.iov_base = node_data(srcnode); rc = node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); } else { - mdbx_cassert(csrc, node_flags(srcnode) == 0); + cASSERT(csrc, node_flags(srcnode) == 0); rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); } if (unlikely(rc != MDBX_SUCCESS)) @@ -17592,12 +17447,12 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", - pdst->mp_pgno, page_numkeys(pdst), - page_fill(cdst->mc_txn->mt_env, pdst)); + DEBUG("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", + pdst->mp_pgno, page_numkeys(pdst), + page_fill(cdst->mc_txn->mt_env, pdst)); - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); } /* Unlink the src page from parent and add to free list. 
*/ @@ -17605,7 +17460,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { node_del(csrc, 0); if (csrc->mc_ki[csrc->mc_top] == 0) { const MDBX_val nullkey = {0, 0}; - rc = mdbx_update_key(csrc, &nullkey); + rc = update_key(csrc, &nullkey); if (unlikely(rc)) { csrc->mc_top++; return rc; @@ -17613,8 +17468,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } csrc->mc_top++; - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); { /* Adjust other cursors pointing to mp */ @@ -17628,7 +17483,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { continue; if (m3->mc_pg[top] == psrc) { m3->mc_pg[top] = pdst; - mdbx_cassert(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); + cASSERT(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); m3->mc_ki[top] += (indx_t)dst_nkeys; m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && @@ -17642,26 +17497,26 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { /* If not operating on GC, allow this page to be reused * in this txn. Otherwise just add to free list. 
*/ - rc = mdbx_page_retire(csrc, (MDBX_page *)psrc); + rc = page_retire(csrc, (MDBX_page *)psrc); if (unlikely(rc)) return rc; - mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); - mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - mdbx_cassert(cdst, cdst->mc_top > 0); - mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + cASSERT(cdst, cdst->mc_db->md_entries > 0); + cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + cASSERT(cdst, cdst->mc_top > 0); + cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top]; const indx_t top_indx = cdst->mc_ki[cdst->mc_top]; const unsigned save_snum = cdst->mc_snum; const uint16_t save_depth = cdst->mc_db->md_depth; - mdbx_cursor_pop(cdst); - rc = mdbx_rebalance(cdst); + cursor_pop(cdst); + rc = rebalance(cdst); if (unlikely(rc)) return rc; - mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); - mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + cASSERT(cdst, cdst->mc_db->md_entries > 0); + cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); #if MDBX_ENABLE_PGOP_STAT cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge.weak += 1; @@ -17669,13 +17524,12 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) { /* LY: don't touch cursor if top-page is a LEAF */ - mdbx_cassert(cdst, - IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } - mdbx_cassert(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); + cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); if (unlikely(pagetype != PAGETYPE_WHOLE(top_page))) { /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */ @@ -17684,10 +17538,9 @@ static int 
mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (top_page == cdst->mc_pg[cdst->mc_top]) { /* LY: don't touch cursor if prev top-page already on the top */ - mdbx_cassert(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); - mdbx_cassert(cdst, - IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cASSERT(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17698,15 +17551,14 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } if (top_page == cdst->mc_pg[new_snum - 1]) { - mdbx_cassert(cdst, cdst->mc_ki[new_snum - 1] == top_indx); + cASSERT(cdst, cdst->mc_ki[new_snum - 1] == top_indx); /* LY: restore cursor stack */ cdst->mc_snum = (uint8_t)new_snum; cdst->mc_top = (uint8_t)new_snum - 1; - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(cdst, - IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17724,11 +17576,10 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum]; cdst->mc_snum = (uint8_t)new_snum; cdst->mc_top = (uint8_t)new_snum - 1; - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(cdst, - IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + 
PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17739,11 +17590,11 @@ bailout: } static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - mdbx_cassert(cdst, cdst->mc_dbi == csrc->mc_dbi); - mdbx_cassert(cdst, cdst->mc_txn == csrc->mc_txn); - mdbx_cassert(cdst, cdst->mc_db == csrc->mc_db); - mdbx_cassert(cdst, cdst->mc_dbx == csrc->mc_dbx); - mdbx_cassert(cdst, cdst->mc_dbistate == csrc->mc_dbistate); + cASSERT(cdst, cdst->mc_dbi == csrc->mc_dbi); + cASSERT(cdst, cdst->mc_txn == csrc->mc_txn); + cASSERT(cdst, cdst->mc_db == csrc->mc_db); + cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx); + cASSERT(cdst, cdst->mc_dbistate == csrc->mc_dbistate); cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; @@ -17759,8 +17610,8 @@ static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { * [in] csrc The cursor to copy from. * [out] cdst The cursor to copy to. */ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= - csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); + cASSERT(csrc, csrc->mc_txn->mt_txnid >= + csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); cdst->mc_dbi = csrc->mc_dbi; cdst->mc_next = NULL; cdst->mc_backup = NULL; @@ -17775,11 +17626,11 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { /* Rebalance the tree after a delete operation. * [in] mc Cursor pointing to the page where rebalancing should begin. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_rebalance(MDBX_cursor *mc) { - mdbx_cassert(mc, cursor_is_tracked(mc)); - mdbx_cassert(mc, mc->mc_snum > 0); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); +static int rebalance(MDBX_cursor *mc) { + cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, mc->mc_snum > 0); + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); STATIC_ASSERT(P_BRANCH == 1); @@ -17793,23 +17644,22 @@ static int mdbx_rebalance(MDBX_cursor *mc) { const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; const unsigned numkeys = page_numkeys(tp); const unsigned room = page_room(tp); - mdbx_debug("rebalancing %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), - page_used(mc->mc_txn->mt_env, tp), room); + DEBUG("rebalancing %s page %" PRIaPGNO + " (has %u keys, full %.1f%%, used %u, room %u bytes )", + (pagetype & P_LEAF) ? 
"leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), + room); if (unlikely(numkeys < minkeys)) { - mdbx_debug("page %" PRIaPGNO " must be merged due keys < %u threshold", - tp->mp_pgno, minkeys); + DEBUG("page %" PRIaPGNO " must be merged due keys < %u threshold", + tp->mp_pgno, minkeys); } else if (unlikely(room > room_threshold)) { - mdbx_debug("page %" PRIaPGNO " should be merged due room %u > %u threshold", - tp->mp_pgno, room, room_threshold); + DEBUG("page %" PRIaPGNO " should be merged due room %u > %u threshold", + tp->mp_pgno, room, room_threshold); } else { - mdbx_debug("no need to rebalance page %" PRIaPGNO - ", room %u < %u threshold", - tp->mp_pgno, room, room_threshold); - mdbx_cassert(mc, mc->mc_db->md_entries > 0); + DEBUG("no need to rebalance page %" PRIaPGNO ", room %u < %u threshold", + tp->mp_pgno, room, room_threshold); + cASSERT(mc, mc->mc_db->md_entries > 0); return MDBX_SUCCESS; } @@ -17817,21 +17667,21 @@ static int mdbx_rebalance(MDBX_cursor *mc) { if (mc->mc_snum < 2) { MDBX_page *const mp = mc->mc_pg[0]; const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); + cASSERT(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); if (IS_SUBP(mp)) { - mdbx_debug("%s", "Can't rebalance a subpage, ignoring"); - mdbx_cassert(mc, pagetype & P_LEAF); + DEBUG("%s", "Can't rebalance a subpage, ignoring"); + cASSERT(mc, pagetype & P_LEAF); return MDBX_SUCCESS; } if (nkeys == 0) { - mdbx_cassert(mc, IS_LEAF(mp)); - mdbx_debug("%s", "tree is completely empty"); - mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); + cASSERT(mc, IS_LEAF(mp)); + DEBUG("%s", "tree is completely empty"); + cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; - mdbx_cassert(mc, mc->mc_db->md_branch_pages == 0 && - mc->mc_db->md_overflow_pages == 0 && - mc->mc_db->md_leaf_pages == 1); + cASSERT(mc, 
mc->mc_db->md_branch_pages == 0 && + mc->mc_db->md_overflow_pages == 0 && + mc->mc_db->md_leaf_pages == 1); /* Adjust cursors pointing to mp */ for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { @@ -17849,11 +17699,11 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; - rc = mdbx_page_retire(mc, mp); + rc = page_retire(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (IS_BRANCH(mp) && nkeys == 1) { - mdbx_debug("%s", "collapsing root page!"); + DEBUG("%s", "collapsing root page!"); mc->mc_db->md_root = node_pgno(page_node(mp, 0)); rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) @@ -17881,18 +17731,17 @@ static int mdbx_rebalance(MDBX_cursor *mc) { m3->mc_top--; } } - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || + PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - rc = mdbx_page_retire(mc, mp); + rc = page_retire(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else { - mdbx_debug("root page %" PRIaPGNO - " doesn't need rebalancing (flags 0x%x)", - mp->mp_pgno, mp->mp_flags); + DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", + mp->mp_pgno, mp->mp_flags); } return MDBX_SUCCESS; } @@ -17900,9 +17749,9 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* The parent (branch page) must have at least 2 pointers, * otherwise the tree is invalid. 
*/ const unsigned pre_top = mc->mc_top - 1; - mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[pre_top])); - mdbx_cassert(mc, !IS_SUBP(mc->mc_pg[0])); - mdbx_cassert(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); + cASSERT(mc, IS_BRANCH(mc->mc_pg[pre_top])); + cASSERT(mc, !IS_SUBP(mc->mc_pg[0])); + cASSERT(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); /* Leaf page fill factor is below the threshold. * Try to move keys from left or right neighbor, or @@ -17919,8 +17768,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { &left, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, - PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); + cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { rc = page_get( @@ -17928,10 +17776,9 @@ static int mdbx_rebalance(MDBX_cursor *mc) { &right, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE_WHOLE(right) == - PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); + cASSERT(mc, PAGETYPE_WHOLE(right) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } - mdbx_cassert(mc, left || right); + cASSERT(mc, left || right); const unsigned ki_top = mc->mc_ki[mc->mc_top]; const unsigned ki_pre_top = mn.mc_ki[pre_top]; @@ -17944,33 +17791,33 @@ static int mdbx_rebalance(MDBX_cursor *mc) { retry: if (left_room > room_threshold && left_room >= right_room) { /* try merge with left */ - mdbx_cassert(mc, left_nkeys >= minkeys); + cASSERT(mc, left_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = left; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); mc->mc_ki[mc->mc_top] = 0; const unsigned new_ki = ki_top + left_nkeys; mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + /* We want rebalance to find mn when doing fixups */ + 
WITH_CURSOR_TRACKING(mn, rc = page_merge(mc, &mn)); if (likely(rc != MDBX_RESULT_TRUE)) { cursor_restore(&mn, mc); mc->mc_ki[mc->mc_top] = (indx_t)new_ki; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } if (right_room > room_threshold) { /* try merge with right */ - mdbx_cassert(mc, right_nkeys >= minkeys); + cASSERT(mc, right_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); + WITH_CURSOR_TRACKING(mn, rc = page_merge(&mn, mc)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } @@ -17982,10 +17829,10 @@ retry: mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); mc->mc_ki[mc->mc_top] = 0; - WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true)); + WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, true)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1); - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } @@ -17995,18 +17842,18 @@ retry: mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, false)); + WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, false)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } if (nkeys >= 
minkeys) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - if (mdbx_audit_enabled()) - return mdbx_cursor_check_updating(mc); + if (AUDIT_ENABLED()) + return cursor_check_updating(mc); return MDBX_SUCCESS; } @@ -18014,16 +17861,15 @@ retry: room_threshold = 0; goto retry; } - mdbx_error("Unable to merge/rebalance %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), - page_used(mc->mc_txn->mt_env, tp), room); + ERROR("Unable to merge/rebalance %s page %" PRIaPGNO + " (has %u keys, full %.1f%%, used %u, room %u bytes )", + (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), + room); return MDBX_PROBLEM; } -__cold static int mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp) { +__cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { DKBUF; int rc = MDBX_SUCCESS; if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) @@ -18050,8 +17896,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", mp->mp_flags & flags_mask, flags_expected); - mdbx_cassert(mc, (mc->mc_checking & CC_LEAF2) == 0 || - (mc->mc_flags & C_SUB) != 0); + cASSERT(mc, (mc->mc_checking & CC_LEAF2) == 0 || (mc->mc_flags & C_SUB) != 0); const uint8_t type = PAGETYPE_WHOLE(mp); switch (type) { default: @@ -18243,7 +18088,7 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); if (unlikely(lp.err != MDBX_SUCCESS)) return lp.err; - mdbx_cassert(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); const unsigned npages = number_of_ovpages(env, dsize); if (unlikely(lp.page->mp_pages != npages)) { if (lp.page->mp_pages < npages) @@ -18399,20 +18244,18 @@ __cold static 
int mdbx_page_check(MDBX_cursor *const mc, return rc; } -__cold static int mdbx_cursor_check(MDBX_cursor *mc) { - mdbx_cassert(mc, - mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == - (mc->mc_txn->mt_parent - ? mc->mc_txn->mt_parent->tw.dirtyroom - : mc->mc_txn->mt_env->me_options.dp_limit)); - mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1 || - (mc->mc_checking & CC_UPDATING)); +__cold static int cursor_check(MDBX_cursor *mc) { + cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == + (mc->mc_txn->mt_parent + ? mc->mc_txn->mt_parent->tw.dirtyroom + : mc->mc_txn->mt_env->me_options.dp_limit)); + cASSERT(mc, mc->mc_top == mc->mc_snum - 1 || (mc->mc_checking & CC_UPDATING)); if (unlikely(mc->mc_top != mc->mc_snum - 1) && (mc->mc_checking & CC_UPDATING) == 0) return MDBX_CURSOR_FULL; - mdbx_cassert(mc, (mc->mc_checking & CC_UPDATING) - ? mc->mc_snum <= mc->mc_db->md_depth - : mc->mc_snum == mc->mc_db->md_depth); + cASSERT(mc, (mc->mc_checking & CC_UPDATING) + ? mc->mc_snum <= mc->mc_db->md_depth + : mc->mc_snum == mc->mc_db->md_depth); if (unlikely((mc->mc_checking & CC_UPDATING) ? mc->mc_snum > mc->mc_db->md_depth : mc->mc_snum != mc->mc_db->md_depth)) @@ -18425,44 +18268,43 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc) { const bool expect_nested_leaf = (n + 1 == mc->mc_db->md_depth - 1) ? true : false; const bool branch = IS_BRANCH(mp) ? 
true : false; - mdbx_cassert(mc, branch == expect_branch); + cASSERT(mc, branch == expect_branch); if (unlikely(branch != expect_branch)) return MDBX_CURSOR_FULL; if ((mc->mc_checking & CC_UPDATING) == 0) { - mdbx_cassert(mc, - nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && - (mc->mc_flags & C_EOF) != 0)); + cASSERT(mc, nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && + (mc->mc_flags & C_EOF) != 0)); if (unlikely(nkeys <= mc->mc_ki[n] && !(!branch && nkeys == mc->mc_ki[n] && (mc->mc_flags & C_EOF) != 0))) return MDBX_CURSOR_FULL; } else { - mdbx_cassert(mc, nkeys + 1 >= mc->mc_ki[n]); + cASSERT(mc, nkeys + 1 >= mc->mc_ki[n]); if (unlikely(nkeys + 1 < mc->mc_ki[n])) return MDBX_CURSOR_FULL; } - int err = mdbx_page_check(mc, mp); + int err = page_check(mc, mp); if (unlikely(err != MDBX_SUCCESS)) return err; for (unsigned i = 0; i < nkeys; ++i) { if (branch) { MDBX_node *node = page_node(mp, i); - mdbx_cassert(mc, node_flags(node) == 0); + cASSERT(mc, node_flags(node) == 0); if (unlikely(node_flags(node) != 0)) return MDBX_CURSOR_FULL; pgno_t pgno = node_pgno(node); MDBX_page *np; err = page_get(mc, pgno, &np, mp->mp_txnid); - mdbx_cassert(mc, err == MDBX_SUCCESS); + cASSERT(mc, err == MDBX_SUCCESS); if (unlikely(err != MDBX_SUCCESS)) return err; const bool nested_leaf = IS_LEAF(np) ? 
true : false; - mdbx_cassert(mc, nested_leaf == expect_nested_leaf); + cASSERT(mc, nested_leaf == expect_nested_leaf); if (unlikely(nested_leaf != expect_nested_leaf)) return MDBX_CURSOR_FULL; - err = mdbx_page_check(mc, np); + err = page_check(mc, np); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -18471,24 +18313,24 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc) { return MDBX_SUCCESS; } -__cold static int mdbx_cursor_check_updating(MDBX_cursor *mc) { +__cold static int cursor_check_updating(MDBX_cursor *mc) { const uint8_t checking = mc->mc_checking; mc->mc_checking |= CC_UPDATING; - const int rc = mdbx_cursor_check(mc); + const int rc = cursor_check(mc); mc->mc_checking = checking; return rc; } /* Complete a delete operation started by mdbx_cursor_del(). */ -static int mdbx_cursor_del0(MDBX_cursor *mc) { +static int cursor_del(MDBX_cursor *mc) { int rc; MDBX_page *mp; indx_t ki; unsigned nkeys; MDBX_dbi dbi = mc->mc_dbi; - mdbx_cassert(mc, cursor_is_tracked(mc)); - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; node_del(mc, mc->mc_db->md_xsize); @@ -18517,27 +18359,27 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { } } - rc = mdbx_rebalance(mc); + rc = rebalance(mc); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (unlikely(!mc->mc_snum)) { /* DB is totally empty now, just bail out. * Other cursors adjustments were already done - * by mdbx_rebalance and aren't needed here. */ - mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); + * by rebalance and aren't needed here. 
*/ + cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); mc->mc_flags |= C_EOF; return MDBX_SUCCESS; } ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && - nkeys == 0)); + cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); /* Adjust this and other cursors pointing to mp */ for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -18549,7 +18391,7 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); + rc = cursor_sibling(m3, SIBLING_RIGHT); if (rc == MDBX_NOTFOUND) { m3->mc_flags |= C_EOF; rc = MDBX_SUCCESS; @@ -18572,10 +18414,10 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { if (!(node_flags(node) & F_SUBDATA)) m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); } else { - rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); + rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -18587,9 +18429,9 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { } } - mdbx_cassert(mc, rc == MDBX_SUCCESS); - if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc); + cASSERT(mc, rc == MDBX_SUCCESS); + if (AUDIT_ENABLED()) + rc = cursor_check(mc); return rc; bailout: @@ -18612,21 +18454,21 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if 
(unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; - return mdbx_del0(txn, dbi, key, data, 0); + return delete (txn, dbi, key, data, 0); } -static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - const MDBX_val *data, unsigned flags) { +static int delete (MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data, unsigned flags) { MDBX_cursor_couple cx; MDBX_cursor_op op; MDBX_val rdata; int rc; DKBUF_DEBUG; - mdbx_debug("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), - DVAL_DEBUG(data)); + DEBUG("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), + DVAL_DEBUG(data)); - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -18638,7 +18480,7 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, op = MDBX_SET; flags |= MDBX_ALLDUPS; } - rc = mdbx_cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; + rc = cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; if (likely(rc == MDBX_SUCCESS)) { /* let mdbx_page_split know about this cursor if needed: * delete will trigger a rebalance; if it needs to move @@ -18679,20 +18521,20 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, MDBX_page *const mp = mc->mc_pg[mc->mc_top]; const unsigned newindx = mc->mc_ki[mc->mc_top]; unsigned nkeys = page_numkeys(mp); - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check_updating(mc); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; } STATIC_ASSERT(P_BRANCH == 1); const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1; - mdbx_debug(">> splitting %s-page %" PRIaPGNO - " and adding %zu+%zu [%s] at %i, nkeys %i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, - newdata ? 
newdata->iov_len : 0, DKEY_DEBUG(newkey), - mc->mc_ki[mc->mc_top], nkeys); - mdbx_cassert(mc, nkeys + 1 >= minkeys * 2); + DEBUG(">> splitting %s-page %" PRIaPGNO + " and adding %zu+%zu [%s] at %i, nkeys %i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, + newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), + mc->mc_ki[mc->mc_top], nkeys); + cASSERT(mc, nkeys + 1 >= minkeys * 2); /* Create a new sibling page. */ pgr_t npr = page_new(mc, mp->mp_flags); @@ -18700,10 +18542,10 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, return npr.err; MDBX_page *const sister = npr.page; sister->mp_leaf2_ksize = mp->mp_leaf2_ksize; - mdbx_debug("new sibling: page %" PRIaPGNO, sister->mp_pgno); + DEBUG("new sibling: page %" PRIaPGNO, sister->mp_pgno); /* Usually when splitting the root page, the cursor - * height is 1. But when called from mdbx_update_key, + * height is 1. But when called from update_key, * the cursor height may be greater because it walks * up the stack while finding the branch slot to update. */ if (mc->mc_top < 1) { @@ -18713,7 +18555,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, goto done; MDBX_page *const pp = npr.page; /* shift current top to make room for new parent */ - mdbx_cassert(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); + cASSERT(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); #if MDBX_DEBUG memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3); memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3); @@ -18725,7 +18567,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - mdbx_debug("root split! new root = %" PRIaPGNO, pp->mp_pgno); + DEBUG("root split! new root = %" PRIaPGNO, pp->mp_pgno); foliage = mc->mc_db->md_depth++; /* Add left (implicit) pointer. 
*/ @@ -18741,14 +18583,14 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_snum++; mc->mc_top++; ptop = 0; - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check_updating(mc); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { ptop = mc->mc_top - 1; - mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); + DEBUG("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); } MDBX_cursor mn; @@ -18761,9 +18603,9 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, (newindx < nkeys) ? /* split at the middle */ (nkeys + 1) >> 1 : /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1; - mdbx_assert(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); - mdbx_cassert(mc, !IS_BRANCH(mp) || newindx > 0); + cASSERT(mc, !IS_BRANCH(mp) || newindx > 0); /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; @@ -18786,7 +18628,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len); } else get_key(page_node(mp, 0), &sepkey); - mdbx_cassert(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); + cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); /* Avoiding rare complex cases of split the parent page */ if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) split_indx = minkeys; @@ -18798,14 +18640,14 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, const bool pure_left = split_indx == 0; if (unlikely(pure_right)) { /* newindx == split_indx == nkeys */ - mdbx_trace("no-split, but add new pure page at the %s", "right/after"); - mdbx_cassert(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); + TRACE("no-split, but add new pure page at the %s", "right/after"); + cASSERT(mc, 
newindx == nkeys && split_indx == nkeys && minkeys == 1); sepkey = *newkey; } else if (unlikely(pure_left)) { /* newindx == split_indx == 0 */ - mdbx_trace("no-split, but add new pure page at the %s", "left/before"); - mdbx_cassert(mc, newindx == 0 && split_indx == 0 && minkeys == 1); - mdbx_trace("old-first-key is %s", DKEY_DEBUG(&sepkey)); + TRACE("no-split, but add new pure page at the %s", "left/before"); + cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); + TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (IS_LEAF2(sister)) { char *split, *ins; @@ -18816,51 +18658,51 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, split = page_leaf2key(mp, split_indx, ksize); rsize = (nkeys - split_indx) * ksize; lsize = (nkeys - split_indx) * sizeof(indx_t); - mdbx_cassert(mc, mp->mp_lower >= lsize); + cASSERT(mc, mp->mp_lower >= lsize); mp->mp_lower -= (indx_t)lsize; - mdbx_cassert(mc, sister->mp_lower + lsize <= UINT16_MAX); + cASSERT(mc, sister->mp_lower + lsize <= UINT16_MAX); sister->mp_lower += (indx_t)lsize; - mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); + cASSERT(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); mp->mp_upper += (indx_t)(rsize - lsize); - mdbx_cassert(mc, sister->mp_upper >= rsize - lsize); + cASSERT(mc, sister->mp_upper >= rsize - lsize); sister->mp_upper -= (indx_t)(rsize - lsize); sepkey.iov_len = ksize; sepkey.iov_base = (newindx != split_indx) ? 
split : newkey->iov_base; if (distance < 0) { - mdbx_cassert(mc, ksize >= sizeof(indx_t)); + cASSERT(mc, ksize >= sizeof(indx_t)); ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(sister->mp_ptrs, split, rsize); sepkey.iov_base = sister->mp_ptrs; memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); memcpy(ins, newkey->iov_base, ksize); - mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); + cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mp->mp_lower += sizeof(indx_t); - mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); } else { memcpy(sister->mp_ptrs, split, distance * ksize); ins = page_leaf2key(sister, distance, ksize); memcpy(ins, newkey->iov_base, ksize); memcpy(ins + ksize, split + distance * ksize, rsize - distance * ksize); - mdbx_cassert(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); + cASSERT(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); sister->mp_lower += sizeof(indx_t); - mdbx_cassert(mc, sister->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t)); sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - mdbx_cassert(mc, distance <= (int)UINT16_MAX); + cASSERT(mc, distance <= (int)UINT16_MAX); mc->mc_ki[mc->mc_top] = (indx_t)distance; } - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check_updating(mc); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; - rc = mdbx_cursor_check_updating(&mn); + rc = cursor_check_updating(&mn); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { /* grab a page to hold a temporary copy */ - tmp_ki_copy = mdbx_page_malloc(mc->mc_txn, 1); + tmp_ki_copy = page_malloc(mc->mc_txn, 1); if (unlikely(tmp_ki_copy == NULL)) { rc = MDBX_ENOMEM; goto done; @@ -18901,8 +18743,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val 
*const newkey, STATIC_ASSERT(P_BRANCH == 1); split_indx += mp->mp_flags & P_BRANCH; } - mdbx_assert(env, - split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); const unsigned dim_nodes = (newindx >= split_indx) ? split_indx : nkeys - split_indx; const unsigned dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; @@ -18914,11 +18755,11 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, unsigned best_split = split_indx; unsigned best_shift = INT_MAX; - mdbx_trace("seek separator from %u, step %i, default %u, new-idx %u, " - "new-size %zu", - i, dir, split_indx, newindx, new_size); + TRACE("seek separator from %u, step %i, default %u, new-idx %u, " + "new-size %zu", + i, dir, split_indx, newindx, new_size); do { - mdbx_cassert(mc, i <= nkeys); + cASSERT(mc, i <= nkeys); size_t size = new_size; if (i != newindx) { MDBX_node *node = @@ -18932,8 +18773,8 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, before += size; after -= size; - mdbx_trace("step %u, size %zu, before %zu, after %zu, max %u", i, - size, before, after, max_space); + TRACE("step %u, size %zu, before %zu, after %zu, max %u", i, size, + before, after, max_space); if (before <= max_space && after <= max_space) { const unsigned split = i + (dir > 0); @@ -18951,10 +18792,9 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } while (i < nkeys); split_indx = best_split; - mdbx_trace("chosen %u", split_indx); + TRACE("chosen %u", split_indx); } - mdbx_assert(env, - split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); sepkey = *newkey; if (split_indx != newindx) { @@ -18966,14 +18806,14 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } } - mdbx_debug("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey)); + DEBUG("separator is %d [%s]", split_indx, 
DKEY_DEBUG(&sepkey)); bool did_split_parent = false; /* Copy separator key to the parent. */ if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { - mdbx_trace("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); - mdbx_cassert(mc, page_numkeys(mn.mc_pg[ptop]) > 2); - mdbx_cassert(mc, !pure_left); + TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); + cASSERT(mc, page_numkeys(mn.mc_pg[ptop]) > 2); + cASSERT(mc, !pure_left); const int snum = mc->mc_snum; const int depth = mc->mc_db->md_depth; mn.mc_snum--; @@ -18984,9 +18824,9 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mn, rc = page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); if (unlikely(rc != MDBX_SUCCESS)) goto done; - mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check_updating(mc); + cASSERT(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } @@ -19008,10 +18848,10 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = mdbx_cursor_sibling(mc, SIBLING_LEFT); + rc = cursor_sibling(mc, SIBLING_LEFT); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { - mdbx_error("unexpected %i error going left sibling", rc); + ERROR("unexpected %i error going left sibling", rc); rc = MDBX_PROBLEM; } goto done; @@ -19020,25 +18860,24 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } else if (unlikely(pure_left)) { MDBX_page *ptop_page = mc->mc_pg[ptop]; - mdbx_debug("adding to parent page %u node[%u] left-leaf page #%u key %s", - ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, - DKEY(mc->mc_ki[ptop] ? 
newkey : NULL)); + DEBUG("adding to parent page %u node[%u] left-leaf page #%u key %s", + ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, + DKEY(mc->mc_ki[ptop] ? newkey : NULL)); mc->mc_top--; rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno); - mdbx_cassert(mc, mp == mc->mc_pg[ptop + 1] && - newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); + cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && + ptop == mc->mc_top); if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { - mdbx_debug("update prev-first key on parent %s", DKEY(&sepkey)); + DEBUG("update prev-first key on parent %s", DKEY(&sepkey)); MDBX_node *node = page_node(mc->mc_pg[ptop], 1); - mdbx_cassert(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); - mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); + cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); + cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); mc->mc_ki[ptop] = 1; - rc = mdbx_update_key(mc, &sepkey); - mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); - mdbx_cassert(mc, - mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); + rc = update_key(mc, &sepkey); + cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); + cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); mc->mc_ki[ptop] = 0; } @@ -19047,12 +18886,11 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, goto done; MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1); - mdbx_cassert(mc, node_pgno(node) == mp->mp_pgno && - mc->mc_pg[ptop] == ptop_page); + cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page); } else { mn.mc_top--; - mdbx_trace("add-to-parent the right-entry[%u] for new sibling-page", - mn.mc_ki[ptop]); + TRACE("add-to-parent the right-entry[%u] for new sibling-page", + mn.mc_ki[ptop]); rc = node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); 
mn.mc_top++; if (unlikely(rc != MDBX_SUCCESS)) @@ -19064,12 +18902,12 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_ki[mc->mc_top] = 0; switch (PAGETYPE_WHOLE(sister)) { case P_LEAF: { - mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); rc = node_add_leaf(mc, 0, newkey, newdata, naf); } break; case P_LEAF | P_LEAF2: { - mdbx_cassert(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); + cASSERT(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); rc = node_add_leaf2(mc, 0, newkey); } break; default: @@ -19089,10 +18927,10 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, &sepkey); if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { mc->mc_top -= (uint8_t)i; - mdbx_debug("update new-first on parent [%i] page %u key %s", - mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, - DKEY(newkey)); - rc = mdbx_update_key(mc, newkey); + DEBUG("update new-first on parent [%i] page %u key %s", + mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, + DKEY(newkey)); + rc = update_key(mc, newkey); mc->mc_top += (uint8_t)i; if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -19106,8 +18944,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, i = split_indx; unsigned n = 0; do { - mdbx_trace("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, - sister->mp_pgno); + TRACE("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, sister->mp_pgno); pgno_t pgno = 0; MDBX_val *rdata = NULL; if (i == newindx) { @@ -19135,18 +18972,18 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, switch (PAGETYPE_WHOLE(sister)) { case P_BRANCH: { - mdbx_cassert(mc, 0 == (uint16_t)flags); + cASSERT(mc, 0 == (uint16_t)flags); /* First branch index doesn't need key data. */ rc = node_add_branch(mc, n, n ? 
&rkey : NULL, pgno); } break; case P_LEAF: { - mdbx_cassert(mc, pgno == 0); - mdbx_cassert(mc, rdata != NULL); + cASSERT(mc, pgno == 0); + cASSERT(mc, rdata != NULL); rc = node_add_leaf(mc, n, &rkey, rdata, flags); } break; /* case P_LEAF | P_LEAF2: { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - mdbx_cassert(mc, gno == 0); + cASSERT(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + cASSERT(mc, gno == 0); rc = mdbx_node_add_leaf2(mc, n, &rkey); } break; */ default: @@ -19160,12 +18997,12 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, i = 0; n = 0; mc->mc_pg[mc->mc_top] = tmp_ki_copy; - mdbx_trace("switch to mp #%u", tmp_ki_copy->mp_pgno); + TRACE("switch to mp #%u", tmp_ki_copy->mp_pgno); } } while (i != split_indx); - mdbx_trace("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, - mc->mc_pg[mc->mc_top]->mp_pgno); + TRACE("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, + mc->mc_pg[mc->mc_top]->mp_pgno); nkeys = page_numkeys(tmp_ki_copy); for (i = 0; i < nkeys; i++) @@ -19232,7 +19069,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, m3->mc_ki[mc->mc_top]++; if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = sister; - mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys); + cASSERT(mc, m3->mc_ki[mc->mc_top] >= nkeys); m3->mc_ki[mc->mc_top] -= (indx_t)nkeys; for (i = 0; i < mc->mc_top; i++) { m3->mc_ki[i] = mn.mc_ki[i]; @@ -19247,18 +19084,18 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (XCURSOR_INITED(m3) && IS_LEAF(mp)) XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } - mdbx_trace("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), - sister->mp_pgno, page_room(sister)); + TRACE("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), + sister->mp_pgno, page_room(sister)); done: if (tmp_ki_copy) - mdbx_dpage_free(env, tmp_ki_copy, 1); + dpage_free(env, tmp_ki_copy, 1); if (unlikely(rc != MDBX_SUCCESS)) 
mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; else { - if (mdbx_audit_enabled()) - rc = mdbx_cursor_check_updating(mc); + if (AUDIT_ENABLED()) + rc = cursor_check_updating(mc); if (unlikely(naf & MDBX_RESERVE)) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node_flags(node) & F_BIGDATA)) @@ -19269,7 +19106,7 @@ done: #endif /* MDBX_ENABLE_PGOP_STAT */ } - mdbx_debug("<< mp #%u, rc %d", mp->mp_pgno, rc); + DEBUG("<< mp #%u, rc %d", mp->mp_pgno, rc); return rc; } @@ -19294,7 +19131,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -19310,8 +19147,8 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], cx.outer.mc_ki[cx.outer.mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && - cx.outer.mc_xcursor->mx_db.md_entries > 1); + tASSERT(txn, XCURSOR_INITED(&cx.outer) && + cx.outer.mc_xcursor->mx_db.md_entries > 1); rc = MDBX_EMULTIVAL; } } @@ -19330,7 +19167,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, typedef struct mdbx_compacting_ctx { MDBX_env *mc_env; MDBX_txn *mc_txn; - mdbx_condpair_t mc_condpair; + osal_condpair_t mc_condpair; uint8_t *mc_wbuf[2]; size_t mc_wlen[2]; mdbx_filehandle_t mc_fd; @@ -19353,10 +19190,10 @@ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); #endif /* EPIPE */ - mdbx_condpair_lock(&ctx->mc_condpair); + osal_condpair_lock(&ctx->mc_condpair); while (!ctx->mc_error) { while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) { - int err = 
mdbx_condpair_wait(&ctx->mc_condpair, true); + int err = osal_condpair_wait(&ctx->mc_condpair, true); if (err != MDBX_SUCCESS) { ctx->mc_error = err; goto bailout; @@ -19371,7 +19208,7 @@ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { ctx->mc_wlen[toggle] = 0; uint8_t *ptr = ctx->mc_wbuf[toggle]; if (!ctx->mc_error) { - int err = mdbx_write(ctx->mc_fd, ptr, wsize); + int err = osal_write(ctx->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) if (err == EPIPE) { @@ -19386,26 +19223,26 @@ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { } } ctx->mc_tail += 1; - mdbx_condpair_signal(&ctx->mc_condpair, false); + osal_condpair_signal(&ctx->mc_condpair, false); } bailout: - mdbx_condpair_unlock(&ctx->mc_condpair); + osal_condpair_unlock(&ctx->mc_condpair); return (THREAD_RESULT)0; } /* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */ __cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) { - mdbx_condpair_lock(&ctx->mc_condpair); - mdbx_assert(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error); + osal_condpair_lock(&ctx->mc_condpair); + eASSERT(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error); ctx->mc_head += 1; - mdbx_condpair_signal(&ctx->mc_condpair, true); + osal_condpair_signal(&ctx->mc_condpair, true); while (!ctx->mc_error && ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) { - int err = mdbx_condpair_wait(&ctx->mc_condpair, false); + int err = osal_condpair_wait(&ctx->mc_condpair, false); if (err != MDBX_SUCCESS) ctx->mc_error = err; } - mdbx_condpair_unlock(&ctx->mc_condpair); + osal_condpair_unlock(&ctx->mc_condpair); return ctx->mc_error; } @@ -19484,18 +19321,18 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_page_search_root(mc, nullptr, MDBX_PS_FIRST); + rc = page_search_root(mc, nullptr, 
MDBX_PS_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Make cursor pages writable */ - char *const buf = mdbx_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); + char *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); if (buf == NULL) return MDBX_ENOMEM; char *ptr = buf; for (unsigned i = 0; i < mc->mc_top; i++) { - mdbx_page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); + page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); mc->mc_pg[i] = (MDBX_page *)ptr; ptr += ctx->mc_env->me_psize; } @@ -19515,7 +19352,7 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, /* Need writable leaf */ if (mp != leaf) { mc->mc_pg[mc->mc_top] = leaf; - mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize); + page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } @@ -19541,29 +19378,28 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, /* Need writable leaf */ if (mp != leaf) { mc->mc_pg[mc->mc_top] = leaf; - mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize); + page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } MDBX_db *nested = nullptr; if (node_flags(node) & F_DUPDATA) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (likely(rc == MDBX_SUCCESS)) { nested = &mc->mc_xcursor->mx_db; rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor, &nested->md_root, mp->mp_txnid); } } else { - mdbx_cassert(mc, - (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0); + cASSERT(mc, (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0); MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); - mdbx_cassert(mc, couple->inner.mx_cursor.mc_signature == - ~MDBX_MC_LIVE && - !couple->inner.mx_cursor.mc_flags && - !couple->inner.mx_cursor.mc_db && - !couple->inner.mx_cursor.mc_dbx); + cASSERT(mc, + couple->inner.mx_cursor.mc_signature == ~MDBX_MC_LIVE && + !couple->inner.mx_cursor.mc_flags && + !couple->inner.mx_cursor.mc_db && + 
!couple->inner.mx_cursor.mc_dbx); nested = &couple->inner.mx_db; memcpy(nested, node_data(node), sizeof(MDBX_db)); rc = compacting_walk_sdb(ctx, nested); @@ -19591,7 +19427,7 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, } /* Whenever we advance to a sibling branch page, * we must proceed all the way down to its first leaf. */ - mdbx_page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); + page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); } continue; } @@ -19614,7 +19450,7 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, node_set_pgno( page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]), pgno); - mdbx_cursor_pop(mc); + cursor_pop(mc); } else { /* Otherwise we're done */ *root = pgno; @@ -19622,7 +19458,7 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, } } done: - mdbx_free(buf); + osal_free(buf); return rc; } @@ -19635,7 +19471,7 @@ __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = mdbx_couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); + int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19648,10 +19484,10 @@ __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { } __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { - mdbx_assert(env, meta->mm_dbs[FREE_DBI].md_mod_txnid || - meta->mm_dbs[FREE_DBI].md_root == P_INVALID); - mdbx_assert(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid || - meta->mm_dbs[MAIN_DBI].md_root == P_INVALID); + eASSERT(env, meta->mm_dbs[FREE_DBI].md_mod_txnid || + meta->mm_dbs[FREE_DBI].md_root == P_INVALID); + eASSERT(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid || + meta->mm_dbs[MAIN_DBI].md_root == P_INVALID); /* Calculate filesize taking in account 
shrink/growing thresholds */ if (meta->mm_geo.next != meta->mm_geo.now) { @@ -19689,13 +19525,13 @@ __cold static void meta_make_sizeable(MDBX_meta *meta) { } /* Copy environment with compaction. */ -__cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { +__cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, const int flags) { const size_t meta_bytes = pgno2bytes(env, NUM_METAS); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, env->me_os_psize); - MDBX_meta *const meta = mdbx_init_metas(env, buffer); + MDBX_meta *const meta = init_metas(env, buffer); meta_set_txnid(env, meta, read_txn->mt_txnid); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) @@ -19713,14 +19549,14 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; compacting_fixup_meta(env, meta); if (dest_is_pipe) { - int rc = mdbx_write(fd, buffer, meta_bytes); + int rc = osal_write(fd, buffer, meta_bytes); if (unlikely(rc != MDBX_SUCCESS)) return rc; } } else { /* Count free pages + GC pages. 
*/ MDBX_cursor_couple couple; - int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI); + int rc = cursor_init(&couple.outer, read_txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages + @@ -19746,7 +19582,7 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, mdbx_compacting_ctx ctx; memset(&ctx, 0, sizeof(ctx)); - rc = mdbx_condpair_init(&ctx.mc_condpair); + rc = osal_condpair_init(&ctx.mc_condpair); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19758,14 +19594,14 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, ctx.mc_fd = fd; ctx.mc_txn = read_txn; - mdbx_thread_t thread; - int thread_err = mdbx_thread_create(&thread, compacting_write_thread, &ctx); + osal_thread_t thread; + int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx); if (likely(thread_err == MDBX_SUCCESS)) { if (dest_is_pipe) { if (!meta->mm_dbs[MAIN_DBI].md_mod_txnid) meta->mm_dbs[MAIN_DBI].md_mod_txnid = read_txn->mt_txnid; compacting_fixup_meta(env, meta); - rc = mdbx_write(fd, buffer, meta_bytes); + rc = osal_write(fd, buffer, meta_bytes); } if (likely(rc == MDBX_SUCCESS)) rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]); @@ -19776,15 +19612,14 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, if (likely(rc == MDBX_SUCCESS) && unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) { if (ctx.mc_next_pgno > meta->mm_geo.next) { - mdbx_error( - "the source DB %s: post-compactification used pages %" PRIaPGNO - " %c expected %" PRIaPGNO, - "has double-used pages or other corruption", ctx.mc_next_pgno, - '>', meta->mm_geo.next); + ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has double-used pages or other corruption", ctx.mc_next_pgno, + '>', meta->mm_geo.next); rc = MDBX_CORRUPTED; /* corrupted DB */ } if (ctx.mc_next_pgno < meta->mm_geo.next) { - mdbx_warning( + WARNING( "the source DB 
%s: post-compactification used pages %" PRIaPGNO " %c expected %" PRIaPGNO, "has page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next); @@ -19797,13 +19632,13 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } /* toggle with empty buffers to exit thread's loop */ - mdbx_assert(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0); + eASSERT(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0); compacting_toggle_write_buffers(&ctx); - thread_err = mdbx_thread_join(thread); - mdbx_assert(env, (ctx.mc_tail == ctx.mc_head && - ctx.mc_wlen[ctx.mc_head & 1] == 0) || - ctx.mc_error); - mdbx_condpair_destroy(&ctx.mc_condpair); + thread_err = osal_thread_join(thread); + eASSERT(env, (ctx.mc_tail == ctx.mc_head && + ctx.mc_wlen[ctx.mc_head & 1] == 0) || + ctx.mc_error); + osal_condpair_destroy(&ctx.mc_condpair); } if (unlikely(thread_err != MDBX_SUCCESS)) return thread_err; @@ -19819,7 +19654,7 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, if (meta->mm_geo.now != meta->mm_geo.next) { const size_t whole_size = pgno2bytes(env, meta->mm_geo.now); if (!dest_is_pipe) - return mdbx_ftruncate(fd, whole_size); + return osal_ftruncate(fd, whole_size); const size_t used_size = pgno2bytes(env, meta->mm_geo.next); memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); @@ -19828,7 +19663,7 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, ? (size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; /* copy to avoid EFAULT in case swapped-out */ - int rc = mdbx_write(fd, data_buffer, chunk); + int rc = osal_write(fd, data_buffer, chunk); if (unlikely(rc != MDBX_SUCCESS)) return rc; offset += chunk; @@ -19838,11 +19673,11 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } /* Copy environment as-is. 
*/ -__cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { +__cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, const int flags) { /* We must start the actual read txn after blocking writers */ - int rc = mdbx_txn_end(read_txn, MDBX_END_RESET_TMP); + int rc = txn_end(read_txn, MDBX_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19851,13 +19686,13 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_txn_renew0(read_txn, MDBX_TXN_RDONLY); + rc = txn_renew(read_txn, MDBX_TXN_RDONLY); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_txn_unlock(env); return rc; } - mdbx_jitter4testing(false); + jitter4testing(false); const size_t meta_bytes = pgno2bytes(env, NUM_METAS); /* Make a snapshot of meta-pages, * but writing ones after the data was flushed */ @@ -19874,10 +19709,10 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, /* Copy the data */ const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); - mdbx_jitter4testing(false); + jitter4testing(false); if (dest_is_pipe) - rc = mdbx_write(fd, buffer, meta_bytes); + rc = osal_write(fd, buffer, meta_bytes); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, env->me_os_psize); @@ -19932,14 +19767,14 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, : used_size - offset; /* copy to avoid EFAULT in case swapped-out */ memcpy(data_buffer, env->me_map + offset, chunk); - rc = mdbx_write(fd, data_buffer, chunk); + rc = osal_write(fd, data_buffer, chunk); offset += chunk; } /* Extend file if required */ if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { if (!dest_is_pipe) - rc = mdbx_ftruncate(fd, 
whole_size); + rc = osal_ftruncate(fd, whole_size); else { memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; @@ -19949,7 +19784,7 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, ? (size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; /* copy to avoid EFAULT in case swapped-out */ - rc = mdbx_write(fd, data_buffer, chunk); + rc = osal_write(fd, data_buffer, chunk); offset += chunk; } } @@ -19964,12 +19799,12 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, if (unlikely(rc != MDBX_SUCCESS)) return rc; - const int dest_is_pipe = mdbx_is_pipe(fd); + const int dest_is_pipe = osal_is_pipe(fd); if (MDBX_IS_ERROR(dest_is_pipe)) return dest_is_pipe; if (!dest_is_pipe) { - rc = mdbx_fseek(fd, 0); + rc = osal_fseek(fd, 0); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -19982,7 +19817,7 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, env->me_os_psize); uint8_t *buffer = NULL; - rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); + rc = osal_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19991,7 +19826,7 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, * write txn. Otherwise other read txns could block writers. */ rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_memalign_free(buffer); + osal_memalign_free(buffer); return rc; } @@ -19999,29 +19834,29 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, /* Firstly write a stub to meta-pages. * Now we sure to incomplete copy will not be used. */ memset(buffer, -1, pgno2bytes(env, NUM_METAS)); - rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS)); } if (likely(rc == MDBX_SUCCESS)) { memset(buffer, 0, pgno2bytes(env, NUM_METAS)); - rc = ((flags & MDBX_CP_COMPACT) ? 
mdbx_env_compact : mdbx_env_copy_asis)( + rc = ((flags & MDBX_CP_COMPACT) ? env_compact : env_copy_asis)( env, read_txn, fd, buffer, dest_is_pipe, flags); } mdbx_txn_abort(read_txn); if (!dest_is_pipe) { if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); /* Write actual meta */ if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } - mdbx_memalign_free(buffer); + osal_memalign_free(buffer); return rc; } @@ -20029,7 +19864,7 @@ __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { #if defined(_WIN32) || defined(_WIN64) const wchar_t *dest_pathW = nullptr; - MUSTDIE_MB2WIDE(dest_path, dest_pathW); + OSAL_MB2WIDE(dest_path, dest_pathW); return mdbx_env_copyW(env, dest_pathW, flags); } @@ -20048,7 +19883,7 @@ LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, * We don't want the OS to cache the writes, since the source data is * already in the OS cache. 
*/ mdbx_filehandle_t newfd; - rc = mdbx_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, + rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, #if defined(_WIN32) || defined(_WIN64) (mdbx_mode_t)-1 #else @@ -20085,11 +19920,11 @@ LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, rc = mdbx_env_copy2fd(env, newfd, flags); if (newfd != INVALID_HANDLE_VALUE) { - int err = mdbx_closefile(newfd); + int err = osal_closefile(newfd); if (rc == MDBX_SUCCESS && err != rc) rc = err; if (rc != MDBX_SUCCESS) - (void)mdbx_removefile(dest_path); + (void)osal_removefile(dest_path); } return rc; @@ -20112,11 +19947,11 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, return MDBX_EACCESS; if ((env->me_flags & MDBX_ENV_ACTIVE) && - unlikely(env->me_txn0->mt_owner == mdbx_thread_self())) + unlikely(env->me_txn0->mt_owner == osal_thread_self())) return MDBX_BUSY; const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) && - env->me_txn0->mt_owner != mdbx_thread_self(); + env->me_txn0->mt_owner != osal_thread_self(); bool should_unlock = false; if (lock_needed) { rc = mdbx_txn_lock(env, false); @@ -20265,12 +20100,12 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) && txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { MDBX_cursor_couple cx; - err = mdbx_cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); if (unlikely(err != MDBX_SUCCESS)) return err; /* scan and account not opened named subDBs */ - err = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (err == MDBX_SUCCESS) { const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; for (unsigned i = 0; i < page_numkeys(mp); i++) { @@ -20296,7 +20131,7 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { stat_add(&db, st, 
bytes); } } - err = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + err = cursor_sibling(&cx.outer, SIBLING_RIGHT); } if (unlikely(err != MDBX_NOTFOUND)) return err; @@ -20323,7 +20158,7 @@ __cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(err != MDBX_SUCCESS)) return err; - if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) /* inside write-txn */ return stat_acc(env->me_txn, dest, bytes); @@ -20352,14 +20187,14 @@ __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0) return MDBX_RESULT_TRUE; MDBX_val key, data; - rc = mdbx_cursor_first(&cx.outer, &key, &data); + rc = cursor_first(&cx.outer, &key, &data); *mask = 0; while (rc == MDBX_SUCCESS) { const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], @@ -20381,10 +20216,10 @@ __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth); break; default: - mdbx_error("wrong node-flags %u", flags); + ERROR("wrong node-flags %u", flags); return MDBX_CORRUPTED; } - rc = mdbx_cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); + rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); } return (rc == MDBX_NOTFOUND) ? 
MDBX_SUCCESS : rc; @@ -20435,7 +20270,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); arg->mi_recent_txnid = meta_txnid(env, recent_meta); arg->mi_meta0_txnid = meta_txnid(env, meta0); @@ -20485,16 +20320,16 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (likely(bytes > size_before_bootid)) { arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); - const uint64_t monotime_now = mdbx_osal_monotime(); + const uint64_t monotime_now = osal_monotime(); uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); arg->mi_since_sync_seconds16dot16 = - ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); arg->mi_since_reader_check_seconds16dot16 = - ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? 
osal_monotime_to_16dot16(monotime_now - ts) : 0; arg->mi_autosync_threshold = pgno2bytes( env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); - arg->mi_autosync_period_seconds16dot16 = mdbx_osal_monotime_to_16dot16( + arg->mi_autosync_period_seconds16dot16 = osal_monotime_to_16dot16( atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); arg->mi_bootid.current.x = bootid.x; arg->mi_bootid.current.y = bootid.y; @@ -20518,7 +20353,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); - arg->mi_pgop_stat.gcrtime_seconds16dot16 = mdbx_osal_monotime_to_16dot16( + arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16( atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed)); #else memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); @@ -20541,7 +20376,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, } } - mdbx_compiler_barrier(); + osal_compiler_barrier(); return MDBX_SUCCESS; } @@ -20603,8 +20438,8 @@ static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) { : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical)); } -static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { +static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { /* LY: so, accepting only three cases for the table's flags: * 1) user_flags and both comparators are zero * = assume that a by-default mode/flags is requested for reading; @@ -20692,7 +20527,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* main table? 
*/ if (!table_name) { - rc = mdbx_dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; *dbi = MAIN_DBI; @@ -20721,7 +20556,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } if (len == txn->mt_dbxs[scan].md_name.iov_len && !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { - rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; *dbi = scan; @@ -20747,10 +20582,10 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, key.iov_len = len; key.iov_base = (void *)table_name; MDBX_cursor_couple couple; - rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; - rc = mdbx_cursor_set(&couple.outer, &key, &data, MDBX_SET).err; + rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) goto early_bailout; @@ -20774,16 +20609,16 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* Done here so we cannot fail after creating a new DB */ - char *namedup = mdbx_strdup(table_name); + char *namedup = osal_strdup(table_name); if (unlikely(!namedup)) { rc = MDBX_ENOMEM; goto early_bailout; } - int err = mdbx_fastmutex_acquire(&env->me_dbi_lock); + int err = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(err != MDBX_SUCCESS)) { rc = err; - mdbx_free(namedup); + osal_free(namedup); goto early_bailout; } @@ -20799,7 +20634,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } if (len == txn->mt_dbxs[scan].md_name.iov_len && !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { - rc = mdbx_dbi_bind(txn, 
scan, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto later_bailout; *dbi = scan; @@ -20816,7 +20651,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, MDBX_db db_dummy; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_mod_txnid = txn->mt_txnid; @@ -20832,20 +20667,20 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, dbiflags |= DBI_DIRTY | DBI_CREAT; txn->mt_flags |= MDBX_TXN_DIRTY; - mdbx_tassert(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); + tASSERT(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); } /* Got info, register DBI in this txn */ memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx)); memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db)); env->me_dbflags[slot] = 0; - rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_tassert(txn, (dbiflags & DBI_CREAT) == 0); + tASSERT(txn, (dbiflags & DBI_CREAT) == 0); later_bailout: *dbi = 0; later_exit: - mdbx_free(namedup); + osal_free(namedup); } else { txn->mt_dbistate[slot] = (uint8_t)dbiflags; txn->mt_dbxs[slot].md_name.iov_base = namedup; @@ -20854,7 +20689,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, if (!(dbiflags & DBI_CREAT)) env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; if (txn->mt_numdbs == slot) { - mdbx_compiler_barrier(); + osal_compiler_barrier(); txn->mt_numdbs = slot + 1; txn->mt_cursors[slot] = NULL; } @@ -20863,7 +20698,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, *dbi = slot; } - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + 
ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); return rc; } @@ -20898,7 +20733,7 @@ __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_BAD_TXN; if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { - rc = mdbx_fetch_sdb(txn, dbi); + rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -20908,8 +20743,8 @@ __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_SUCCESS; } -static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { - mdbx_assert(env, dbi >= CORE_DBS); +static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { + eASSERT(env, dbi >= CORE_DBS); if (unlikely(dbi >= env->me_numdbs)) return MDBX_BAD_DBI; @@ -20920,9 +20755,9 @@ static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { env->me_dbflags[dbi] = 0; env->me_dbxs[dbi].md_name.iov_len = 0; - mdbx_memory_fence(mo_AcquireRelease, true); + osal_memory_fence(mo_AcquireRelease, true); env->me_dbxs[dbi].md_name.iov_base = NULL; - mdbx_free(ptr); + osal_free(ptr); if (env->me_numdbs == dbi + 1) { unsigned i = env->me_numdbs; @@ -20943,12 +20778,12 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_BAD_DBI; - rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + rc = osal_fastmutex_acquire(&env->me_dbi_lock); if (likely(rc == MDBX_SUCCESS)) { rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) - ? mdbx_dbi_close_locked(env, dbi) + ? 
dbi_close_locked(env, dbi) : MDBX_BAD_DBI; - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } return rc; } @@ -20978,8 +20813,8 @@ int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { } #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ -static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { - int rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); +static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { + int rc = page_search(mc, NULL, MDBX_PS_FIRST); if (likely(rc == MDBX_SUCCESS)) { MDBX_txn *txn = mc->mc_txn; @@ -20988,11 +20823,11 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { * Also if the DB doesn't have sub-DBs and has no large/overflow * pages, omit scanning leaves. */ if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) - mdbx_cursor_pop(mc); + cursor_pop(mc); - rc = mdbx_pnl_need(&txn->tw.retired_pages, - mc->mc_db->md_branch_pages + mc->mc_db->md_leaf_pages + - mc->mc_db->md_overflow_pages); + rc = pnl_need(&txn->tw.retired_pages, mc->mc_db->md_branch_pages + + mc->mc_db->md_leaf_pages + + mc->mc_db->md_overflow_pages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -21002,11 +20837,11 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { MDBX_page *const mp = mc->mc_pg[mc->mc_top]; const unsigned nkeys = page_numkeys(mp); if (IS_LEAF(mp)) { - mdbx_cassert(mc, mc->mc_snum == mc->mc_db->md_depth); + cASSERT(mc, mc->mc_snum == mc->mc_db->md_depth); for (unsigned i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); if (node_flags(node) & F_BIGDATA) { - rc = mdbx_page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); + rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) @@ -21016,26 +20851,26 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const 
bool may_have_subDBs) { rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE; goto bailout; } - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } } else { - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth); + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth); mc->mc_checking |= CC_RETIRING; const unsigned pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? P_LEAF : P_BRANCH); for (unsigned i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); - mdbx_tassert(txn, (node_flags(node) & - (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + tASSERT(txn, (node_flags(node) & + (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); const pgno_t pgno = node_pgno(node); - rc = mdbx_page_retire_ex(mc, pgno, nullptr, pagetype); + rc = page_retire_ex(mc, pgno, nullptr, pagetype); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -21043,16 +20878,16 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { } if (!mc->mc_top) break; - mdbx_cassert(mc, nkeys > 0); + cASSERT(mc, nkeys > 0); mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + rc = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_NOTFOUND)) goto bailout; /* no more siblings, go back to beginning * of previous level. 
*/ pop: - mdbx_cursor_pop(mc); + cursor_pop(mc); mc->mc_ki[0] = 0; for (unsigned i = 1; i < mc->mc_snum; i++) { mc->mc_ki[i] = 0; @@ -21060,7 +20895,7 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { } } } - rc = mdbx_page_retire(mc, mc->mc_pg[0]); + rc = page_retire(mc, mc->mc_pg[0]); bailout: if (unlikely(rc != MDBX_SUCCESS)) txn->mt_flags |= MDBX_TXN_ERROR; @@ -21081,8 +20916,8 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_drop_tree(mc, dbi == MAIN_DBI || - (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); + rc = drop_tree(mc, + dbi == MAIN_DBI || (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); /* Invalidate the dropped DB's cursors */ for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -21091,20 +20926,19 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { /* Can't delete the main DB */ if (del && dbi >= CORE_DBS) { - rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + rc = delete (txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (likely(rc == MDBX_SUCCESS)) { - mdbx_tassert(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); - mdbx_tassert(txn, txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); + tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY); txn->mt_dbistate[dbi] = DBI_STALE; MDBX_env *env = txn->mt_env; - rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + rc = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) { txn->mt_flags |= MDBX_TXN_ERROR; goto bailout; } - mdbx_dbi_close_locked(env, dbi); - mdbx_ensure(env, - mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + dbi_close_locked(env, dbi); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } else { txn->mt_flags |= MDBX_TXN_ERROR; } @@ -21188,7 +21022,7 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, 
atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed))) goto retry_reader; - mdbx_assert(env, txnid > 0); + eASSERT(env, txnid > 0); if (txnid >= SAFE64_INVALID_THRESHOLD) txnid = 0; @@ -21201,7 +21035,7 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, const uint64_t head_pages_retired = unaligned_peek_u64_volatile(4, recent_meta->mm_pages_retired); const txnid_t head_txnid = meta_txnid(env, recent_meta); - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); if (unlikely(recent_meta != meta_prefer_last(env) || head_pages_retired != unaligned_peek_u64_volatile( @@ -21228,7 +21062,7 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, /* Insert pid into list if not already present. * return -1 if already present. */ -__cold static bool mdbx_pid_insert(uint32_t *ids, uint32_t pid) { +__cold static bool pid_insert(uint32_t *ids, uint32_t pid) { /* binary search of pid in list */ unsigned base = 0; unsigned cursor = 1; @@ -21264,20 +21098,20 @@ __cold static bool mdbx_pid_insert(uint32_t *ids, uint32_t pid) { __cold int mdbx_reader_check(MDBX_env *env, int *dead) { if (dead) *dead = 0; - return mdbx_cleanup_dead_readers(env, false, dead); + return cleanup_dead_readers(env, false, dead); } /* Return: * MDBX_RESULT_TRUE - done and mutex recovered * MDBX_SUCCESS - done * Otherwise errcode. 
*/ -__cold MDBX_INTERNAL_FUNC int -mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { +__cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, + int rdt_locked, int *dead) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_assert(env, rdt_locked >= 0); + eASSERT(env, rdt_locked >= 0); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL)) { /* exclusive mode */ @@ -21292,7 +21126,7 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { uint32_t *const pids = (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) ? pidsbuf_onstask - : mdbx_malloc((snap_nreaders + 1) * sizeof(uint32_t)); + : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t)); if (unlikely(!pids)) return MDBX_ENOMEM; @@ -21305,21 +21139,21 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { continue /* skip empty */; if (pid == env->me_pid) continue /* skip self */; - if (!mdbx_pid_insert(pids, pid)) + if (!pid_insert(pids, pid)) continue /* such pid already processed */; - int err = mdbx_rpid_check(env, pid); + int err = osal_rpid_check(env, pid); if (err == MDBX_RESULT_TRUE) continue /* reader is live */; if (err != MDBX_SUCCESS) { rc = err; - break /* mdbx_rpid_check() failed */; + break /* osal_rpid_check() failed */; } /* stale reader found */ if (!rdt_locked) { - err = mdbx_rdt_lock(env); + err = osal_rdt_lock(env); if (MDBX_IS_ERROR(err)) { rc = err; break; @@ -21336,7 +21170,7 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { if (lck->mti_readers[i].mr_pid.weak != pid) continue; - err = mdbx_rpid_check(env, pid); + err = osal_rpid_check(env, pid); if (MDBX_IS_ERROR(err)) { rc = err; break; @@ -21349,8 +21183,8 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { /* clean it */ for (unsigned j = i; j < snap_nreaders; j++) { if (lck->mti_readers[j].mr_pid.weak == pid) { - mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, - 
(size_t)pid, lck->mti_readers[j].mr_txnid.weak); + DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, + lck->mti_readers[j].mr_txnid.weak); atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed); atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease); count++; @@ -21359,25 +21193,25 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { } if (likely(!MDBX_IS_ERROR(rc))) - atomic_store64(&lck->mti_reader_check_timestamp, mdbx_osal_monotime(), + atomic_store64(&lck->mti_reader_check_timestamp, osal_monotime(), mo_Relaxed); if (rdt_locked < 0) - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); if (pids != pidsbuf_onstask) - mdbx_free(pids); + osal_free(pids); if (dead) *dead = count; return rc; } -__cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { - const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16); +__cold int mdbx_setup_debug(int level, int flags, MDBX_debug_func *logger) { + const int rc = runtime_flags | (loglevel << 16); - if (loglevel != MDBX_LOG_DONTCHANGE) - mdbx_loglevel = (uint8_t)loglevel; + if (level != MDBX_LOG_DONTCHANGE) + loglevel = (uint8_t)level; if (flags != MDBX_DBG_DONTCHANGE) { flags &= @@ -21386,17 +21220,17 @@ __cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { #endif MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE; - mdbx_runtime_flags = (uint8_t)flags; + runtime_flags = (uint8_t)flags; } if (logger != MDBX_LOGGER_DONTCHANGE) - mdbx_debug_logger = logger; + debug_logger = logger; return rc; } __cold static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard) { - mdbx_debug("DB size maxed out by reading #%" PRIaTXN, laggard); + DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); MDBX_hsr_func *const callback = env->me_hsr_callback; txnid_t oldest = 0; bool notify_eof_of_loop = false; @@ -21404,17 +21238,17 @@ __cold static txnid_t 
kick_longlived_readers(MDBX_env *env, do { env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; oldest = find_oldest_reader(env); - mdbx_assert(env, oldest < env->me_txn0->mt_txnid); - mdbx_assert(env, oldest >= laggard); - mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak); + eASSERT(env, oldest < env->me_txn0->mt_txnid); + eASSERT(env, oldest >= laggard); + eASSERT(env, oldest >= env->me_lck->mti_oldest_reader.weak); - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); const txnid_t steady = meta_txnid(env, meta_prefer_steady(env)); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck) break; - if (MDBX_IS_ERROR(mdbx_cleanup_dead_readers(env, false, NULL))) + if (MDBX_IS_ERROR(cleanup_dead_readers(env, false, NULL))) break; if (!callback) @@ -21477,8 +21311,8 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, /* notify end of hsr-loop */ const txnid_t turn = oldest - laggard; if (turn) - mdbx_notice("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, - laggard, oldest, turn); + NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, + laggard, oldest, turn); callback(env, env->me_txn, 0, 0, laggard, (turn < UINT_MAX) ? 
(unsigned)turn : UINT_MAX, 0, -retry); } @@ -21538,7 +21372,7 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) const pgno_t maxpg = meta->mm_geo.now; *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } - mdbx_memory_fence(mo_AcquireRelease, false); + osal_memory_fence(mo_AcquireRelease, false); } while (unlikely(recent != meta_txnid(env, meta))); txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP; @@ -21553,8 +21387,8 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const char *name, int deep); +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, + const char *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { if (mp) @@ -21574,9 +21408,8 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { } /* Depth-first tree traversal. */ -__cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - const char *name, int deep, - txnid_t parent_txnid) { +__cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, + const char *name, int deep, txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); @@ -21625,7 +21458,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid); err = lp.err; if (err == MDBX_SUCCESS) { - mdbx_cassert(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + cASSERT(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); npages = lp.page->mp_pages; } @@ -21732,7 +21565,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { assert(err == MDBX_SUCCESS); - err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); + err = walk_tree(ctx, 
node_pgno(node), name, deep + 1, mp->mp_txnid); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_RESULT_TRUE) break; @@ -21758,16 +21591,16 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, char namebuf_onstask[64]; char *const sub_name = (namelen < sizeof(namebuf_onstask)) ? namebuf_onstask - : mdbx_malloc(namelen + 1); + : osal_malloc(namelen + 1); if (unlikely(!sub_name)) return MDBX_ENOMEM; memcpy(sub_name, node_key(node), namelen); sub_name[namelen] = 0; memcpy(&db, node_data(node), sizeof(db)); assert(err == MDBX_SUCCESS); - err = mdbx_walk_sdb(ctx, &db, sub_name, deep + 1); + err = walk_sdb(ctx, &db, sub_name, deep + 1); if (sub_name != namebuf_onstask) - mdbx_free(sub_name); + osal_free(sub_name); } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: @@ -21780,10 +21613,10 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); assert(err == MDBX_SUCCESS); - err = mdbx_xcursor_init1(ctx->mw_cursor, node, mp); + err = cursor_xinit1(ctx->mw_cursor, node, mp); if (likely(err == MDBX_SUCCESS)) { ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); + err = walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); MDBX_xcursor *inner_xcursor = container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); MDBX_cursor_couple *couple = @@ -21798,15 +21631,15 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const char *name, int deep) { +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, + const char *name, int deep) { if (unlikely(sdb->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ MDBX_cursor_couple couple; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; uint8_t dbistate = DBI_VALID | 
DBI_AUDITED; - int rc = mdbx_couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); + int rc = couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -21818,9 +21651,8 @@ __cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, : CC_PAGECHECK; couple.outer.mc_next = ctx->mw_cursor; ctx->mw_cursor = &couple.outer; - rc = mdbx_walk_tree(ctx, sdb->md_root, name, deep, - sdb->md_mod_txnid ? sdb->md_mod_txnid - : ctx->mw_txn->mt_txnid); + rc = walk_tree(ctx, sdb->md_root, name, deep, + sdb->md_mod_txnid ? sdb->md_mod_txnid : ctx->mw_txn->mt_txnid); ctx->mw_cursor = couple.outer.mc_next; return rc; } @@ -21844,9 +21676,9 @@ __cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * NUM_METAS); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); + rc = walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); + rc = walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); return rc; } @@ -21974,7 +21806,7 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { - mdbx_error("Mismatch cursors's pages at %u level", r->level); + ERROR("Mismatch cursors's pages at %u level", r->level); return MDBX_PROBLEM; } @@ -22139,7 +21971,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, cursor_copy(cursor, &next.outer); if (cursor->mc_db->md_flags & MDBX_DUPSORT) { next.outer.mc_xcursor = &next.inner; - rc = mdbx_xcursor_init0(&next.outer); + rc = cursor_xinit0(&next.outer); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; @@ -22197,7 +22029,7 @@ int 
mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, MDBX_cursor_couple begin; /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ - rc = mdbx_cursor_init(&begin.outer, txn, dbi); + rc = cursor_init(&begin.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -22213,7 +22045,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, return MDBX_SUCCESS; } MDBX_val stub = {0, 0}; - rc = mdbx_cursor_first(&begin.outer, &stub, &stub); + rc = cursor_first(&begin.outer, &stub, &stub); if (unlikely(end_key == MDBX_EPSILON)) { /* LY: FIRST..+epsilon case */ return (rc == MDBX_SUCCESS) @@ -22225,7 +22057,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (end_key == NULL) { /* LY: -epsilon..LAST case */ MDBX_val stub = {0, 0}; - rc = mdbx_cursor_last(&begin.outer, &stub, &stub); + rc = cursor_last(&begin.outer, &stub, &stub); return (rc == MDBX_SUCCESS) ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) : rc; @@ -22242,7 +22074,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, (begin_key == end_key || begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) { /* LY: single key case */ - rc = mdbx_cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err; + rc = cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { *size_items = 0; return (rc == MDBX_NOTFOUND) ? 
MDBX_SUCCESS : rc; @@ -22253,8 +22085,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, begin.outer.mc_ki[begin.outer.mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { /* LY: return the number of duplicates for given key */ - mdbx_tassert(txn, - begin.outer.mc_xcursor == &begin.inner && + tASSERT(txn, begin.outer.mc_xcursor == &begin.inner && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED)); *size_items = (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) || @@ -22265,8 +22096,8 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, } return MDBX_SUCCESS; } else { - rc = mdbx_cursor_set(&begin.outer, begin_key, begin_data, - begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) + rc = cursor_set(&begin.outer, begin_key, begin_data, + begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) .err; } } @@ -22277,15 +22108,15 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, } MDBX_cursor_couple end; - rc = mdbx_cursor_init(&end.outer, txn, dbi); + rc = cursor_init(&end.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (!end_key) { MDBX_val stub = {0, 0}; - rc = mdbx_cursor_last(&end.outer, &stub, &stub); + rc = cursor_last(&end.outer, &stub, &stub); } else { - rc = mdbx_cursor_set(&end.outer, end_key, end_data, - end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) + rc = cursor_set(&end.outer, end_key, end_data, + end_data ? 
MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) .err; } if (unlikely(rc != MDBX_SUCCESS)) { @@ -22378,7 +22209,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, return MDBX_EINVAL; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -22420,8 +22251,8 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, /* disallow update/delete for multi-values */ MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && - cx.outer.mc_xcursor->mx_db.md_entries > 1); + tASSERT(txn, XCURSOR_INITED(&cx.outer) && + cx.outer.mc_xcursor->mx_db.md_entries > 1); if (cx.outer.mc_xcursor->mx_db.md_entries > 1) { rc = MDBX_EMULTIVAL; goto bailout; @@ -22553,7 +22384,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, return MDBX_BAD_DBI; if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { - rc = mdbx_fetch_sdb(txn, dbi); + rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -22570,7 +22401,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(new < increment)) return MDBX_RESULT_TRUE; - mdbx_tassert(txn, new > dbs->md_seq); + tASSERT(txn, new > dbs->md_seq); dbs->md_seq = new; txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbistate[dbi] |= DBI_DIRTY; @@ -22648,7 +22479,7 @@ static __always_inline uint64_t double2key(const double *const ptr) { const int64_t i = *(const int64_t *)ptr; const uint64_t u = (i < 0) ? 
UINT64_C(0xffffFFFFffffFFFF) - i : i + UINT64_C(0x8000000000000000); - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const double f = key2double(u); assert(memcmp(&f, ptr, 8) == 0); } @@ -22671,7 +22502,7 @@ static __always_inline uint32_t float2key(const float *const ptr) { const int32_t i = *(const int32_t *)ptr; const uint32_t u = (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000); - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const float f = key2float(u); assert(memcmp(&f, ptr, 4) == 0); } @@ -22878,7 +22709,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return err; const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && - env->me_txn0->mt_owner != mdbx_thread_self()); + env->me_txn0->mt_owner != osal_thread_self()); bool should_unlock = false; switch (option) { case MDBX_opt_sync_bytes: @@ -22911,7 +22742,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(value > UINT32_MAX)) return MDBX_TOO_LARGE; if (atomic_store64(&env->me_lck->mti_autosync_period, - mdbx_osal_16dot16_to_monotime((uint32_t)value), + osal_16dot16_to_monotime((uint32_t)value), mo_Relaxed) != 0 && (env->me_flags & MDBX_ENV_ACTIVE)) { err = mdbx_env_sync_poll(env); @@ -22956,13 +22787,13 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, } env->me_options.dp_reserve_limit = (unsigned)value; while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { - mdbx_assert(env, env->me_dp_reserve != NULL); + eASSERT(env, env->me_dp_reserve != NULL); MDBX_page *dp = env->me_dp_reserve; MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); env->me_dp_reserve = dp->mp_next; VALGRIND_MEMPOOL_FREE(env, dp); - mdbx_free(dp); + osal_free(dp); env->me_dp_reserve_len -= 1; } } @@ -23076,7 +22907,7 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, case 
MDBX_opt_sync_period: if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; - *pvalue = mdbx_osal_monotime_to_16dot16( + *pvalue = osal_monotime_to_16dot16( atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed)); break; @@ -23221,7 +23052,7 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, MDBX_cursor_couple cx; MDBX_val old_data; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + int rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL); diff --git a/src/internals.h b/src/internals.h index af82934e..119bce90 100644 --- a/src/internals.h +++ b/src/internals.h @@ -285,15 +285,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -328,9 +328,9 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } @@ -344,10 +344,10 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -584,17 +584,17 @@ typedef struct { wops; /* Number of explicit write operations (not a pages) to a disk */ MDBX_atomic_uint64_t gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The - unit/scale is platform-depended, see mdbx_osal_monotime(). */ + unit/scale is platform-depended, see osal_monotime(). 
*/ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -602,17 +602,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -729,7 +729,7 @@ typedef struct MDBX_lockinfo { /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; @@ -755,7 +755,7 @@ typedef struct MDBX_lockinfo { /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. 
@@ -924,7 +924,7 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define TXN_FLAGS \ @@ -1123,11 +1123,11 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ + osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -1138,11 +1138,11 @@ struct MDBX_env { uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ + osal_thread_key_t me_txkey; /* thread-key for readers */ pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ @@ -1198,7 +1198,7 @@ struct MDBX_env { /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t me_dbi_lock; #if MDBX_CACHE_METAPTR volatile const MDBX_meta *cache_last_meta; volatile const MDBX_meta *cache_steady_meta; @@ -1211,11 +1211,11 @@ struct 
MDBX_env { MDBX_PNL me_retired_pages; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -1250,142 +1250,138 @@ struct MDBX_env { #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); #else (void)tiny; #endif } MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); + debug_log(int level, const char *function, int line, const char *fmt, ...) 
+ MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); #if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) +#define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) #else -#define mdbx_assert_enabled() (0) +#define ASSERT_ENABLED() (0) #endif /* assertions */ -#define mdbx_debug_extra(fmt, ...) \ +#define DEBUG_EXTRA(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ +#define DEBUG_EXTRA_PRINT(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_trace(fmt, ...) \ +#define TRACE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) 
\ +#define DEBUG(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_verbose(fmt, ...) \ +#define VERBOSE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define NOTICE(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define WARNING(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#define FATAL(fmt, ...) 
\ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); -#define mdbx_ensure_msg(env, expr, msg) \ +#define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) /* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ +#define eASSERT(env, expr) \ do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ } while (0) /* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) /* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ #undef assert -#define assert(expr) mdbx_assert(NULL, expr) +#define assert(expr) eASSERT(NULL, expr) #endif /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char 
*)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -1410,15 +1406,15 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -1648,14 +1644,14 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } 
while (0) diff --git a/src/lck-posix.c b/src/lck-posix.c index d4c26e4c..acbfb849 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -28,7 +28,7 @@ #include #ifndef xMDBX_ALLOY -uint32_t mdbx_linux_kernel_version; +uint32_t linux_kernel_version; bool mdbx_RunningOnWSL1; #endif /* xMDBX_ALLOY */ @@ -43,7 +43,7 @@ __cold static uint8_t probe_for_WSL(const char *tag) { if (WSL || wsl || strcasestr(tag, "Microsoft")) /* Expecting no new kernel within WSL1, either it will explicitly * marked by an appropriate WSL-version hint. */ - return (mdbx_linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; + return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; return 0; } @@ -76,7 +76,7 @@ mdbx_global_constructor(void) { if (number > 0) { if (number > 255) number = 255; - mdbx_linux_kernel_version += number << (24 - i * 8); + linux_kernel_version += number << (24 - i * 8); } ++i; } else { @@ -96,13 +96,13 @@ mdbx_global_constructor(void) { } #endif /* Linux */ - mdbx_rthc_global_init(); + global_ctor(); } MDBX_EXCLUDE_FOR_GPROF __cold static __attribute__((__destructor__)) void mdbx_global_destructor(void) { - mdbx_rthc_global_dtor(); + global_dtor(); #ifdef ENABLE_GPROF if (!&__gmon_start__) _mcleanup(); @@ -118,15 +118,15 @@ mdbx_global_destructor(void) { * размещаются совместно используемые posix-мьютексы (futex). Посредством * этих мьютексов (см struct MDBX_lockinfo) реализуются: * - Блокировка таблицы читателей для регистрации, - * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). + * т.е. функции osal_rdt_lock() и osal_rdt_unlock(). * - Блокировка БД для пишущих транзакций, * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). * * Остальной функционал реализуется отдельно посредством файловых блокировок: * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод - * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). + * в операционный режим, функции osal_lck_seize() и osal_lck_downgrade(). 
* - Проверка присутствие процессов-читателей, - * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). + * т.е. функции osal_rpid_set(), osal_rpid_clear() и osal_rpid_check(). * * Для блокировки файлов используется fcntl(F_SETLK), так как: * - lockf() оперирует только эксклюзивной блокировкой и требует @@ -170,9 +170,9 @@ mdbx_global_destructor(void) { static int op_setlk, op_setlkw, op_getlk; __cold static void choice_fcntl(void) { assert(!op_setlk && !op_setlkw && !op_getlk); - if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 + if ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 #if defined(__linux__) || defined(__gnu_linux__) - && mdbx_linux_kernel_version > + && linux_kernel_version > 0x030f0000 /* OFD locks are available since 3.15, but engages here only for 3.16 and later kernels (i.e. LTS) because of reliability reasons */ @@ -207,7 +207,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, "The bitness of system `off_t` type is mismatch. Please " "fix build and/or NDK configuration."); #endif /* Android */ - mdbx_jitter4testing(true); + jitter4testing(true); assert(offset >= 0 && len > 0); assert((uint64_t)offset < (uint64_t)INT64_MAX && (uint64_t)len < (uint64_t)INT64_MAX && @@ -230,7 +230,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, lock_op.l_start = offset; lock_op.l_len = len; int rc = fcntl(fd, cmd, &lock_op); - mdbx_jitter4testing(true); + jitter4testing(true); if (rc != -1) { if (cmd == op_getlk) { /* Checks reader by pid. Returns: @@ -265,7 +265,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, } } -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { #if MDBX_USE_OFDLOCKS if (unlikely(op_setlk == 0)) choice_fcntl(); @@ -273,21 +273,21 @@ MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { return lck_op(fd, wait ? 
op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); } -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); } -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1); } -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(pid > 0); return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); @@ -296,7 +296,7 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { /*---------------------------------------------------------------------------*/ #if MDBX_LOCKING > MDBX_LOCKING_SYSV -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_init(ipc, false, 1) ? errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -307,7 +307,7 @@ MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { #endif } -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_destroy(ipc) ? 
errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -325,7 +325,7 @@ static int check_fstat(MDBX_env *env) { int rc = MDBX_SUCCESS; if (fstat(env->me_lazy_fd, &st)) { rc = errno; - mdbx_error("fstat(%s), err %d", "DXB", rc); + ERROR("fstat(%s), err %d", "DXB", rc); return rc; } @@ -335,15 +335,14 @@ static int check_fstat(MDBX_env *env) { #else rc = EPERM; #endif - mdbx_error("%s %s, err %d", "DXB", - (st.st_nlink < 1) ? "file was removed" : "not a regular file", - rc); + ERROR("%s %s, err %d", "DXB", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); return rc; } if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { - mdbx_verbose("dxb-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); + VERBOSE("dxb-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); rc = MDBX_RESULT_TRUE; } @@ -351,7 +350,7 @@ static int check_fstat(MDBX_env *env) { if (fstat(env->me_lfd, &st)) { rc = errno; - mdbx_error("fstat(%s), err %d", "LCK", rc); + ERROR("fstat(%s), err %d", "LCK", rc); return rc; } @@ -361,26 +360,25 @@ static int check_fstat(MDBX_env *env) { #else rc = EPERM; #endif - mdbx_error("%s %s, err %d", "LCK", - (st.st_nlink < 1) ? "file was removed" : "not a regular file", - rc); + ERROR("%s %s, err %d", "LCK", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); return rc; } /* Checking file size for detect the situation when we got the shared lock - * immediately after mdbx_lck_destroy(). */ + * immediately after osal_lck_destroy(). 
*/ if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { - mdbx_verbose("lck-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); + VERBOSE("lck-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); rc = MDBX_RESULT_TRUE; } return rc; } -__cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { +__cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; #if MDBX_USE_OFDLOCKS if (unlikely(op_setlk == 0)) @@ -391,10 +389,10 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { #if defined(__linux__) || defined(__gnu_linux__) if (unlikely(mdbx_RunningOnWSL1)) { rc = ENOLCK /* No record locks available */; - mdbx_error("%s, err %u", - "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " - "injecting failure to avoid data loss", - rc); + ERROR("%s, err %u", + "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " + "injecting failure to avoid data loss", + rc); return rc; } #endif /* Linux */ @@ -405,8 +403,8 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "without-lck", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } return MDBX_RESULT_TRUE /* Done: return with exclusive locking. 
*/; @@ -419,8 +417,8 @@ retry: if (rc == MDBX_RESULT_TRUE) { rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "unlock-before-retry", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "unlock-before-retry", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } } @@ -446,23 +444,23 @@ retry: /* the cause may be a collision with POSIX's file-lock recovery. */ if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "dxb-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "dxb-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } /* Fallback to lck-shared */ } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "try-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } /* Here could be one of two: - * - mdbx_lck_destroy() from the another process was hold the lock + * - osal_lck_destroy() from the another process was hold the lock * during a destruction. - * - either mdbx_lck_seize() from the another process was got the exclusive + * - either osal_lck_seize() from the another process was got the exclusive * lock and doing initialization. * For distinguish these cases will use size of the lck-file later. */ @@ -471,8 +469,8 @@ retry: * competing process doesn't call lck_downgrade(). 
*/ rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "try-shared", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-shared", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -480,7 +478,7 @@ retry: if (rc == MDBX_RESULT_TRUE) goto retry; if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "lck_fstat", rc); + ERROR("%s, err %u", "lck_fstat", rc); return rc; } @@ -491,8 +489,8 @@ retry: if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "try-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -501,8 +499,8 @@ retry: lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "lock-against-without-lck", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "lock-against-without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -510,9 +508,9 @@ retry: return MDBX_RESULT_FALSE; } -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; int rc = MDBX_SUCCESS; @@ -525,15 +523,15 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { if (rc == MDBX_SUCCESS) rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); if (unlikely(rc != 0)) { - mdbx_error("%s, err %u", "lck", rc); + ERROR("%s, err %u", "lck", rc); assert(MDBX_IS_ERROR(rc)); } return rc; } -__cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor) { - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != 
env->me_pid)) return MDBX_PANIC; int rc = MDBX_SUCCESS; @@ -548,25 +546,25 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX) == 0) { - mdbx_verbose("%p got exclusive, drown locks", (void *)env); + VERBOSE("%p got exclusive, drown locks", (void *)env); #if MDBX_LOCKING == MDBX_LOCKING_SYSV if (env->me_sysv_ipc.semid != -1) rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; #else - rc = mdbx_ipclock_destroy(&lck->mti_rlock); + rc = osal_ipclock_destroy(&lck->mti_rlock); if (rc == 0) - rc = mdbx_ipclock_destroy(&lck->mti_wlock); + rc = osal_ipclock_destroy(&lck->mti_wlock); #endif /* MDBX_LOCKING */ - mdbx_assert(env, rc == 0); + eASSERT(env, rc == 0); if (rc == 0) { const bool synced = lck->mti_unsynced_pages.weak == 0; - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (synced) rc = ftruncate(env->me_lfd, 0) ? errno : 0; } - mdbx_jitter4testing(false); + jitter4testing(false); } /* 1) POSIX's fcntl() locks (i.e. 
when op_setlk == F_SETLK) should be restored @@ -607,7 +605,7 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /* restore file-locks */ rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader) - rc = mdbx_rpid_set(inprocess_neighbor); + rc = osal_rpid_set(inprocess_neighbor); } } @@ -618,7 +616,7 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /*---------------------------------------------------------------------------*/ -__cold MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag) { #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -765,7 +763,7 @@ bailout: #endif /* MDBX_LOCKING > 0 */ } -__cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, +__cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, const int err) { int rc = err; #if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -782,10 +780,10 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, rc = MDBX_PANIC; } } - mdbx_warning("%clock owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); + WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); - int check_rc = mdbx_cleanup_dead_readers(env, rlocked, NULL); + int check_rc = cleanup_dead_readers(env, rlocked, NULL); check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -803,7 +801,7 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; if (unlikely(mreco_rc)) - mdbx_error("lock recovery failed, %s", mdbx_strerror(mreco_rc)); + ERROR("lock recovery failed, %s", mdbx_strerror(mreco_rc)); rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; if (MDBX_IS_ERROR(rc)) @@ -826,24 +824,24 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, #error "FIXME" #endif /* MDBX_LOCKING */ - mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); + ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); if (rc != EDEADLK) env->me_flags |= MDBX_FATAL_ERROR; return rc; } #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void) { +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) { /* avoid 32-bit Bionic bug/hang with 32-pit TID */ if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) { pid_t tid = gettid(); if (unlikely(tid > 0xffff)) { - mdbx_fatal("Raise the ENOSYS(%d) error to avoid hang due " - "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " - "that don’t fit in 16 bits, see " - "https://android.googlesource.com/platform/bionic/+/master/" - "docs/32-bit-abi.md#is-too-small-for-large-pids", - ENOSYS, tid, tid); + FATAL("Raise the ENOSYS(%d) error to avoid hang due " + "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " + "that don’t fit in 16 bits, see " + "https://android.googlesource.com/platform/bionic/+/master/" + "docs/32-bit-abi.md#is-too-small-for-large-pids", + ENOSYS, tid, tid); return ENOSYS; } } @@ -851,11 +849,11 @@ MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void) { } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ -static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, +static int mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, const bool dont_wait) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - int rc = mdbx_check_tid4bionic(); + int rc = osal_check_tid4bionic(); if (likely(rc == 0)) rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); rc = (rc == EBUSY && dont_wait) ? 
MDBX_BUSY : rc; @@ -891,7 +889,7 @@ static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, return rc; } -static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { +static int mdbx_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 int rc = pthread_mutex_unlock(ipc); @@ -913,38 +911,38 @@ static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { - mdbx_trace("%s", ">>"); - mdbx_jitter4testing(true); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { + TRACE("%s", ">>"); + jitter4testing(true); int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); return rc; } -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { - mdbx_trace("%s", ">>"); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { + TRACE("%s", ">>"); int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: err %d\n", __func__, rc); - mdbx_jitter4testing(true); + jitter4testing(true); } int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { - mdbx_trace("%swait %s", dont_wait ? "dont-" : "", ">>"); - mdbx_jitter4testing(true); + TRACE("%swait %s", dont_wait ? "dont-" : "", ">>"); + jitter4testing(true); int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); return MDBX_IS_ERROR(rc) ? 
rc : MDBX_SUCCESS; } void mdbx_txn_unlock(MDBX_env *env) { - mdbx_trace("%s", ">>"); + TRACE("%s", ">>"); int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: err %d\n", __func__, rc); - mdbx_jitter4testing(true); + jitter4testing(true); } #else diff --git a/src/lck-windows.c b/src/lck-windows.c index 6ba3f3a9..7b833773 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -48,16 +48,16 @@ static switch (reason) { case DLL_PROCESS_ATTACH: mdbx_winnt_import(); - mdbx_rthc_global_init(); + global_ctor(); break; case DLL_PROCESS_DETACH: - mdbx_rthc_global_dtor(); + global_dtor(); break; case DLL_THREAD_ATTACH: break; case DLL_THREAD_DETACH: - mdbx_rthc_thread_dtor(module); + thread_dtor(module); break; } #if MDBX_BUILD_SHARED_LIBRARY @@ -186,8 +186,8 @@ void mdbx_txn_unlock(MDBX_env *env) { #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { - mdbx_srwlock_AcquireShared(&env->me_remap_guard); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { + osal_srwlock_AcquireShared(&env->me_remap_guard); if (env->me_lfd == INVALID_HANDLE_VALUE) return MDBX_SUCCESS; /* readonly database in readonly filesystem */ @@ -198,21 +198,21 @@ MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { return MDBX_SUCCESS; int rc = (int)GetLastError(); - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); return rc; } -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* transition from S-E (locked) to S-? (used), e.g. 
unlock upper-part */ if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && !funlock(env->me_lfd, LCK_UPPER)) mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); } - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); } -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { return flock(fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, @@ -225,7 +225,7 @@ static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) { const unsigned limit = (*array)->limit; if ((*array)->count == limit) { - void *ptr = mdbx_realloc( + void *ptr = osal_realloc( (limit > ARRAY_LENGTH((*array)->handles)) ? *array : /* don't free initial array on the stack */ NULL, @@ -259,8 +259,8 @@ static int suspend_and_append(mdbx_handle_array_t **array, } MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { - mdbx_assert(env, (env->me_flags & MDBX_NOTLS) == 0); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { + eASSERT(env, (env->me_flags & MDBX_NOTLS) == 0); const uintptr_t CurrentTid = GetCurrentThreadId(); int rc; if (env->me_lck_mmap.lck) { @@ -282,7 +282,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak); if (rc != MDBX_SUCCESS) { bailout_lck: - (void)mdbx_resume_threads_after_remap(*array); + (void)osal_resume_threads_after_remap(*array); return rc; } } @@ -294,7 +294,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { } else { /* Without LCK (i.e. read-only mode). 
* Walk through a snapshot of all running threads */ - mdbx_assert(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); + eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); if (hSnapshot == INVALID_HANDLE_VALUE) return (int)GetLastError(); @@ -306,7 +306,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { rc = (int)GetLastError(); bailout_toolhelp: CloseHandle(hSnapshot); - (void)mdbx_resume_threads_after_remap(*array); + (void)osal_resume_threads_after_remap(*array); return rc; } @@ -331,7 +331,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { } MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { +osal_resume_threads_after_remap(mdbx_handle_array_t *array) { int rc = MDBX_SUCCESS; for (unsigned i = 0; i < array->count; ++i) { const HANDLE hThread = array->handles[i]; @@ -370,11 +370,11 @@ mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { * E-S * E-E = exclusive-write, i.e. exclusive due (re)initialization * - * The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked + * The osal_lck_seize() moves the locking-FSM from the initial free/unlocked * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, * or to the "used" (and returns MDBX_RESULT_FALSE). * - * The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write" + * The osal_lck_downgrade() moves the locking-FSM from "exclusive write" * state to the "used" (i.e. shared) state. * * The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) @@ -432,21 +432,21 @@ static int internal_seize_lck(HANDLE lfd) { assert(lfd != INVALID_HANDLE_VALUE); /* 1) now on ?-? 
(free), get ?-E (middle) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { rc = (int)GetLastError() /* 2) something went wrong, give up */; - mdbx_error("%s, err %u", "?-?(free) >> ?-E(middle)", rc); + ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); return rc; } /* 3) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; /* 5) still on ?-E (middle) */ rc = (int)GetLastError(); - mdbx_jitter4testing(false); + jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ if (!funlock(lfd, LCK_UPPER)) @@ -456,13 +456,13 @@ static int internal_seize_lck(HANDLE lfd) { } /* 7) still on ?-E (middle), try S-E (locked) */ - mdbx_jitter4testing(false); + jitter4testing(false); rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE : (int)GetLastError(); - mdbx_jitter4testing(false); + jitter4testing(false); if (rc != MDBX_RESULT_FALSE) - mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), * transition to S-? (used) or ?-? (free) */ @@ -474,7 +474,7 @@ static int internal_seize_lck(HANDLE lfd) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { int rc; assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); @@ -485,17 +485,17 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. 
on read-only filesystem) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { rc = (int)GetLastError(); - mdbx_error("%s, err %u", "without-lck", rc); + ERROR("%s, err %u", "without-lck", rc); return rc; } return MDBX_RESULT_FALSE; } rc = internal_seize_lck(env->me_lfd); - mdbx_jitter4testing(false); + jitter4testing(false); if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. * Doing such check by exclusive locking the body-part of db. Should be @@ -505,11 +505,11 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { * while opening db in valid (non-conflict) mode. */ if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { rc = (int)GetLastError(); - mdbx_error("%s, err %u", "lock-against-without-lck", rc); - mdbx_jitter4testing(false); + ERROR("%s, err %u", "lock-against-without-lck", rc); + jitter4testing(false); lck_unlock(env); } else { - mdbx_jitter4testing(false); + jitter4testing(false); if (!funlock(env->me_lazy_fd, LCK_BODY)) mdbx_panic("%s(%s) failed: err %u", __func__, "unlock-against-without-lck", (int)GetLastError()); @@ -519,7 +519,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { /* Transite from exclusive-write state (E-E) to used (S-?) 
*/ assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); @@ -535,7 +535,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { /* 2) now at ?-E (middle), transition to S-E (locked) */ if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { int rc = (int)GetLastError() /* 3) something went wrong, give up */; - mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); return rc; } @@ -557,10 +557,10 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { int rc; /* 1) now on S-? (used), try S-E (locked) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) { rc = (int)GetLastError() /* 2) something went wrong, give up */; - mdbx_verbose("%s, err %u", "S-?(used) >> S-E(locked)", rc); + VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); return rc; } @@ -570,17 +570,17 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { (int)GetLastError()); /* 4) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); + jitter4testing(false); if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) { rc = (int)GetLastError() /* 5) something went wrong, give up */; - mdbx_verbose("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); + VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); return rc; } return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */; } -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag) { (void)env; @@ -589,19 +589,19 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor) { /* LY: should unmap before releasing the locks to avoid race 
condition and * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ if (env->me_map) - mdbx_munmap(&env->me_dxb_mmap); + osal_munmap(&env->me_dxb_mmap); if (env->me_lck_mmap.lck) { const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && mdbx_lck_upgrade(env) == MDBX_SUCCESS) /* this will fail if LCK is used/mmapped by other process(es) */ - mdbx_ftruncate(env->me_lfd, 0); + osal_ftruncate(env->me_lfd, 0); } lck_unlock(env); return MDBX_SUCCESS; @@ -610,12 +610,12 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /*----------------------------------------------------------------------------*/ /* reader checking (by pid) */ -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } @@ -626,7 +626,7 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. 
*/ -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { (void)env; HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); int rc; @@ -663,11 +663,11 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { // Stub for slim read-write lock // Copyright (C) 1995-2002 Brad Wilson -static void WINAPI stub_srwlock_Init(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) { srwl->readerCount = srwl->writerCount = 0; } -static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { while (true) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); @@ -692,12 +692,12 @@ static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) { } } -static void WINAPI stub_srwlock_ReleaseShared(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) { assert(srwl->readerCount > 0); _InterlockedDecrement(&srwl->readerCount); } -static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { while (true) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); @@ -723,7 +723,7 @@ static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) { } } -static void WINAPI stub_srwlock_ReleaseExclusive(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) { assert(srwl->writerCount == 1 && srwl->readerCount >= 0); srwl->writerCount = 0; } @@ -739,9 +739,9 @@ static uint64_t WINAPI stub_GetTickCount64(void) { /*----------------------------------------------------------------------------*/ #ifndef xMDBX_ALLOY -MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared, - mdbx_srwlock_ReleaseShared, mdbx_srwlock_AcquireExclusive, - mdbx_srwlock_ReleaseExclusive; +osal_srwlock_t_function osal_srwlock_Init, 
osal_srwlock_AcquireShared, + osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, + osal_srwlock_ReleaseExclusive; MDBX_NtExtendSection mdbx_NtExtendSection; MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; @@ -789,24 +789,24 @@ static void mdbx_winnt_import(void) { GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); #undef GET_PROC_ADDR - const MDBX_srwlock_function init = - (MDBX_srwlock_function)GetProcAddress(hKernel32dll, "InitializeSRWLock"); + const osal_srwlock_t_function init = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "InitializeSRWLock"); if (init != NULL) { - mdbx_srwlock_Init = init; - mdbx_srwlock_AcquireShared = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_Init = init; + osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "AcquireSRWLockShared"); - mdbx_srwlock_ReleaseShared = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "ReleaseSRWLockShared"); - mdbx_srwlock_AcquireExclusive = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "AcquireSRWLockExclusive"); - mdbx_srwlock_ReleaseExclusive = (MDBX_srwlock_function)GetProcAddress( + osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress( hKernel32dll, "ReleaseSRWLockExclusive"); } else { - mdbx_srwlock_Init = stub_srwlock_Init; - mdbx_srwlock_AcquireShared = stub_srwlock_AcquireShared; - mdbx_srwlock_ReleaseShared = stub_srwlock_ReleaseShared; - mdbx_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; - mdbx_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; + osal_srwlock_Init = stub_srwlock_Init; + osal_srwlock_AcquireShared = stub_srwlock_AcquireShared; + osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared; + osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; + osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; } } 
diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 0b288f08..d2dea1e3 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" typedef struct flagbit { @@ -193,12 +193,12 @@ static void pagemap_cleanup(void) { for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1; i < ARRAY_LENGTH(walk.dbi); ++i) { if (walk.dbi[i].name) { - mdbx_free((void *)walk.dbi[i].name); + osal_free((void *)walk.dbi[i].name); walk.dbi[i].name = nullptr; } } - mdbx_free(walk.pagemap); + osal_free(walk.pagemap); walk.pagemap = nullptr; } @@ -229,7 +229,7 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { if (dbi == ARRAY_END(walk.dbi)) return nullptr; - dbi->name = mdbx_strdup(dbi_name); + dbi->name = osal_strdup(dbi_name); return last = dbi; } @@ -247,7 +247,7 @@ static void MDBX_PRINTF_ARGS(4, 5) break; if (!p) { - p = mdbx_calloc(1, sizeof(*p)); + p = osal_calloc(1, sizeof(*p)); if (unlikely(!p)) return; p->caption = msg; @@ -292,7 +292,7 @@ static size_t problems_pop(struct problem *list) { count += problems_list->count; print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption, problems_list->count); - mdbx_free(problems_list); + osal_free(problems_list); problems_list = p; } print("\n"); @@ -529,7 +529,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, number = data->iov_len / sizeof(pgno_t) - 1; } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= /* LY: allow gap up to one page. 
it is ok - * and better than shink-and-retry inside mdbx_update_gc() */ + * and better than shink-and-retry inside update_gc() */ envinfo.mi_dxb_pagesize) problem_add("entry", txnid, "extra idl space", "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", @@ -626,7 +626,7 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, return handle_userdb(record_number, key, data); } - name = mdbx_malloc(key->iov_len + 1); + name = osal_malloc(key->iov_len + 1); if (unlikely(!name)) return MDBX_ENOMEM; memcpy(name, key->iov_base, key->iov_len); @@ -634,7 +634,7 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, userdb_count++; rc = process_db(~0u, name, handle_userdb, false); - mdbx_free(name); + osal_free(name); if (rc != MDBX_INCOMPATIBLE) return rc; @@ -1340,7 +1340,7 @@ int main(int argc, char *argv[]) { } #endif if (rc) { - error("mdbx_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); + error("osal_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } @@ -1504,7 +1504,7 @@ int main(int argc, char *argv[]) { print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); fflush(nullptr); - walk.pagemap = mdbx_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); + walk.pagemap = osal_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); if (!walk.pagemap) { rc = errno ? errno : MDBX_ENOMEM; error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc)); diff --git a/src/mdbx_copy.c b/src/mdbx_copy.c index 4b40b558..18eafca0 100644 --- a/src/mdbx_copy.c +++ b/src/mdbx_copy.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #if defined(_WIN32) || defined(_WIN64) diff --git a/src/mdbx_drop.c b/src/mdbx_drop.c index 9b0a18b5..0680fc11 100644 --- a/src/mdbx_drop.c +++ b/src/mdbx_drop.c @@ -22,7 +22,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #include diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index 170a5332..364e03ab 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #include @@ -403,7 +403,7 @@ int main(int argc, char *argv[]) { if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_realloc(buf4free, key.iov_len + 1); + subname = osal_realloc(buf4free, key.iov_len + 1); if (!subname) { rc = MDBX_ENOMEM; break; diff --git a/src/mdbx_load.c b/src/mdbx_load.c index 10e54750..b9fdfd8c 100644 --- a/src/mdbx_load.c +++ b/src/mdbx_load.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #include @@ -213,7 +213,7 @@ static int readhdr(void) { if (str) { if (*str) { free(subname); - subname = mdbx_strdup(str); + subname = osal_strdup(str); if (!subname) { if (!quiet) perror("strdup()"); @@ -421,7 +421,7 @@ __hot static int readline(MDBX_val *out, MDBX_val *buf) { /* Is buffer too short? 
*/ while (c1[len - 1] != '\n') { - buf->iov_base = mdbx_realloc(buf->iov_base, buf->iov_len * 2); + buf->iov_base = osal_realloc(buf->iov_base, buf->iov_len * 2); if (!buf->iov_base) { if (!quiet) fprintf(stderr, @@ -560,7 +560,7 @@ int main(int argc, char *argv[]) { envflags |= MDBX_NOSUBDIR; break; case 's': - subname = mdbx_strdup(optarg); + subname = osal_strdup(optarg); break; case 'N': putflags |= MDBX_NOOVERWRITE | MDBX_NODUPDATA; @@ -606,7 +606,7 @@ int main(int argc, char *argv[]) { fflush(nullptr); dbuf.iov_len = 4096; - dbuf.iov_base = mdbx_malloc(dbuf.iov_len); + dbuf.iov_base = osal_malloc(dbuf.iov_len); if (!dbuf.iov_base) { rc = MDBX_ENOMEM; error("value-buffer", rc); diff --git a/src/mdbx_stat.c b/src/mdbx_stat.c index fa229435..ebf53324 100644 --- a/src/mdbx_stat.c +++ b/src/mdbx_stat.c @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #if defined(_WIN32) || defined(_WIN64) @@ -469,13 +469,13 @@ int main(int argc, char *argv[]) { MDBX_dbi subdbi; if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_malloc(key.iov_len + 1); + subname = osal_malloc(key.iov_len + 1); memcpy(subname, key.iov_base, key.iov_len); subname[key.iov_len] = '\0'; rc = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &subdbi); if (rc == MDBX_SUCCESS) printf("Status of %s\n", subname); - mdbx_free(subname); + osal_free(subname); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_INCOMPATIBLE) continue; diff --git a/src/osal.c b/src/osal.c index 91a3f8c7..a3e23eee 100644 --- a/src/osal.c +++ b/src/osal.c @@ -232,12 +232,12 @@ __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, (void)env; #endif /* MDBX_DEBUG */ - if (mdbx_debug_logger) - mdbx_debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); + if (debug_logger) + 
debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); else { #if defined(_WIN32) || defined(_WIN64) char *message = nullptr; - const int num = mdbx_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", + const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", msg, func ? func : "unknown", line); if (num < 1 || !message) message = ""; @@ -261,7 +261,7 @@ __cold void mdbx_panic(const char *fmt, ...) { va_start(ap, fmt); char *message = nullptr; - const int num = mdbx_vasprintf(&message, fmt, ap); + const int num = osal_vasprintf(&message, fmt, ap); va_end(ap); const char *const const_message = (num < 1 || !message) ? "" @@ -281,8 +281,8 @@ __cold void mdbx_panic(const char *fmt, ...) { /*----------------------------------------------------------------------------*/ -#ifndef mdbx_vasprintf -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, +#ifndef osal_vasprintf +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap) { va_list ones; va_copy(ones, ap); @@ -294,7 +294,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, return needed; } - *strp = mdbx_malloc(needed + 1); + *strp = osal_malloc(needed + 1); if (unlikely(*strp == nullptr)) { va_end(ones); #if defined(_WIN32) || defined(_WIN64) @@ -310,25 +310,25 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, assert(actual == needed); if (unlikely(actual < 0)) { - mdbx_free(*strp); + osal_free(*strp); *strp = nullptr; } return actual; } -#endif /* mdbx_vasprintf */ +#endif /* osal_vasprintf */ -#ifndef mdbx_asprintf -MDBX_INTERNAL_FUNC int mdbx_asprintf(char **strp, const char *fmt, ...) { +#ifndef osal_asprintf +MDBX_INTERNAL_FUNC int osal_asprintf(char **strp, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); - int rc = mdbx_vasprintf(strp, fmt, ap); + int rc = osal_vasprintf(strp, fmt, ap); va_end(ap); return rc; } -#endif /* mdbx_asprintf */ +#endif /* osal_asprintf */ -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result) { assert(is_powerof2(alignment) && alignment >= sizeof(void *)); #if defined(_WIN32) || defined(_WIN64) @@ -349,35 +349,35 @@ MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, #error FIXME #endif } -#endif /* mdbx_memalign_alloc */ +#endif /* osal_memalign_alloc */ -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr) { +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr) { #if defined(_WIN32) || defined(_WIN64) VirtualFree(ptr, 0, MEM_RELEASE); #else - mdbx_free(ptr); + osal_free(ptr); #endif } -#endif /* mdbx_memalign_free */ +#endif /* osal_memalign_free */ -#ifndef mdbx_strdup -char *mdbx_strdup(const char *str) { +#ifndef osal_strdup +char *osal_strdup(const char *str) { if (!str) return NULL; size_t bytes = strlen(str) + 1; - char *dup = mdbx_malloc(bytes); + char *dup = osal_malloc(bytes); if (dup) memcpy(dup, str, bytes); return dup; } -#endif /* mdbx_strdup */ +#endif /* osal_strdup */ /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair) { int rc; - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); #if defined(_WIN32) || defined(_WIN64) if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) { rc = (int)GetLastError(); @@ -410,11 +410,11 @@ bailout_cond: (void)pthread_mutex_destroy(&condpair->mutex); #endif bailout_mutex: - memset(condpair, 
0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); return rc; } -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError(); @@ -424,20 +424,20 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) { rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc; rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc; #endif - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); return rc; } -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(condpair->mutex, INFINITE); return waitstatus2errcode(code); #else - return mdbx_pthread_mutex_lock(&condpair->mutex); + return osal_pthread_mutex_lock(&condpair->mutex); #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) return ReleaseMutex(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); #else @@ -445,7 +445,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) { #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part) { #if defined(_WIN32) || defined(_WIN64) return SetEvent(condpair->event[part]) ? 
MDBX_SUCCESS : (int)GetLastError(); @@ -454,7 +454,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part) { #if defined(_WIN32) || defined(_WIN64) DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part], @@ -472,7 +472,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -481,7 +481,7 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) DeleteCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -490,7 +490,7 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) __try { EnterCriticalSection(fastmutex); @@ -503,11 +503,11 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { } return MDBX_SUCCESS; #else - return mdbx_pthread_mutex_lock(fastmutex); + return osal_pthread_mutex_lock(fastmutex); #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -527,7 
+527,7 @@ static const DWORD WC_ERR_INVALID_CHARS = : 0; #endif /* WC_ERR_INVALID_CHARS */ -MDBX_INTERNAL_FUNC size_t mdbx_mb2w(wchar_t *dst, size_t dst_n, const char *src, +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, size_t src_n) { return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, (int)src_n, dst, (int)dst_n); @@ -537,7 +537,7 @@ MDBX_INTERNAL_FUNC size_t mdbx_mb2w(wchar_t *dst, size_t dst_n, const char *src, /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_removefile(const pathchar_t *pathname) { +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) { #if defined(_WIN32) || defined(_WIN64) return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); #else @@ -549,7 +549,7 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const pathchar_t *pathname) { static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } #endif /*! Windows */ -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const pathchar_t *pathname) { +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { #if defined(_WIN32) || defined(_WIN64) return RemoveDirectoryW(pathname) ? 
MDBX_SUCCESS : (int)GetLastError(); #else @@ -557,7 +557,7 @@ MDBX_INTERNAL_FUNC int mdbx_removedirectory(const pathchar_t *pathname) { #endif } -MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, mdbx_filehandle_t *fd, @@ -687,18 +687,18 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; static const char dev_null[] = "/dev/null"; if (!is_valid_fd(STDIN_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", - STDIN_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", + STDIN_FILENO, dev_null); stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); } if (!is_valid_fd(STDOUT_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", - "OUT", STDOUT_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT", + STDOUT_FILENO, dev_null); stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); } if (!is_valid_fd(STDERR_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", - "ERR", STDERR_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR", + STDERR_FILENO, dev_null); stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); } #else @@ -723,20 +723,20 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 if (*fd == STDIN_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", - STDIN_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", + STDIN_FILENO); assert(stub_fd0 == -1); *fd = dup(stub_fd0 = *fd); } if (*fd == STDOUT_FILENO) { - mdbx_warning("Got 
STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", - STDOUT_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", + STDOUT_FILENO); assert(stub_fd1 == -1); *fd = dup(stub_fd1 = *fd); } if (*fd == STDERR_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", - STDERR_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", + STDERR_FILENO); assert(stub_fd2 == -1); *fd = dup(stub_fd2 = *fd); } @@ -747,10 +747,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, if (stub_fd2 != -1) close(stub_fd2); if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) { - mdbx_error( - "Rejecting the use of a FD in the range " - "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", - STDIN_FILENO, STDERR_FILENO); + ERROR("Rejecting the use of a FD in the range " + "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", + STDIN_FILENO, STDERR_FILENO); close(*fd); return EBADF; } @@ -777,7 +776,7 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError(); #else @@ -786,7 +785,7 @@ MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { #endif } -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { if (bytes > MAX_WRITE) return MDBX_EINVAL; @@ -813,7 +812,7 @@ MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, return (bytes == (size_t)read) ? 
MDBX_SUCCESS : MDBX_ENODATA; } -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, uint64_t offset) { while (true) { #if defined(_WIN32) || defined(_WIN64) @@ -849,7 +848,7 @@ MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, } } -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { while (true) { #if defined(_WIN32) || defined(_WIN64) @@ -879,13 +878,13 @@ MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, } } -int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, +int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written) { #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \ (defined(__ANDROID_API__) && __ANDROID_API__ < 24) size_t written = 0; for (int i = 0; i < iovcnt; ++i) { - int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); + int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); if (unlikely(rc != MDBX_SUCCESS)) return rc; written += iov[i].iov_len; @@ -908,8 +907,8 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, #endif } -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - enum mdbx_syncmode_bits mode_bits) { +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + enum osal_syncmode_bits mode_bits) { #if defined(_WIN32) || defined(_WIN64) if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd)) return (int)GetLastError(); @@ -938,7 +937,7 @@ MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, break /* error */; #if defined(__linux__) || defined(__gnu_linux__) case MDBX_SYNC_SIZE: - if (mdbx_linux_kernel_version >= 0x03060000) + if (linux_kernel_version >= 0x03060000) return MDBX_SUCCESS; __fallthrough /* fall through */; 
#endif /* Linux */ @@ -955,7 +954,7 @@ MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, #endif } -int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { +int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(fd, &info)) @@ -974,7 +973,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) switch (GetFileType(fd)) { case FILE_TYPE_DISK: @@ -1005,7 +1004,7 @@ MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { #endif } -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_SetFileInformationByHandle) { FILE_END_OF_FILE_INFO EndOfFileInfo; @@ -1029,7 +1028,7 @@ MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER li; li.QuadPart = pos; @@ -1045,7 +1044,7 @@ MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { /*----------------------------------------------------------------------------*/ MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg) { #if defined(_WIN32) || defined(_WIN64) @@ -1056,7 +1055,7 @@ mdbx_thread_create(mdbx_thread_t *thread, #endif } -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { #if defined(_WIN32) || defined(_WIN64) DWORD code = 
WaitForSingleObject(thread, INFINITE); return waitstatus2errcode(code); @@ -1068,16 +1067,16 @@ MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits) { + enum osal_syncmode_bits mode_bits) { uint8_t *ptr = (uint8_t *)map->address + offset; #if defined(_WIN32) || defined(_WIN64) if (!FlushViewOfFile(ptr, length)) return (int)GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) - if (mode_bits == MDBX_SYNC_NONE && mdbx_linux_kernel_version > 0x02061300) + if (mode_bits == MDBX_SYNC_NONE && linux_kernel_version > 0x02061300) /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly * tracks dirty pages and flushes them to storage as necessary. */ return MDBX_SUCCESS; @@ -1086,10 +1085,10 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, return errno; mode_bits &= ~MDBX_SYNC_DATA; #endif - return mdbx_fsync(map->fd, mode_bits); + return osal_fsync(map->fd, mode_bits); } -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err) { #if defined(_WIN32) || defined(_WIN64) @@ -1119,7 +1118,7 @@ MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, return MDBX_SUCCESS; } -static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { +static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) return ERROR_NOT_CAPABLE /* workaround for Wine */; @@ -1166,7 +1165,7 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { } if (mdbx_GetVolumeInformationByHandleW && 
mdbx_GetFinalPathNameByHandleW) { - WCHAR *PathBuffer = mdbx_malloc(sizeof(WCHAR) * INT16_MAX); + WCHAR *PathBuffer = osal_malloc(sizeof(WCHAR) * INT16_MAX); if (!PathBuffer) return MDBX_ENOMEM; @@ -1234,7 +1233,7 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { } bailout: - mdbx_free(PathBuffer); + osal_free(PathBuffer); return rc; } @@ -1411,11 +1410,10 @@ static int check_mmap_limit(const size_t limit) { const int log2page = log2n_powerof2(pagesize); if ((limit >> (log2page + 7)) > (size_t)total_ram_pages || (limit >> (log2page + 6)) > (size_t)avail_ram_pages) { - mdbx_error( - "%s (%zu pages) is too large for available (%zu pages) or total " - "(%zu pages) system RAM", - "database upper size limit", limit >> log2page, avail_ram_pages, - total_ram_pages); + ERROR("%s (%zu pages) is too large for available (%zu pages) or total " + "(%zu pages) system RAM", + "database upper size limit", limit >> log2page, avail_ram_pages, + total_ram_pages); return MDBX_TOO_LARGE; } } @@ -1423,7 +1421,7 @@ static int check_mmap_limit(const size_t limit) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t size, const size_t limit, const unsigned options) { assert(size <= limit); @@ -1435,7 +1433,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, map->section = NULL; #endif /* Windows */ - int err = mdbx_check_fs_local(map->fd, flags); + int err = osal_check_fs_local(map->fd, flags); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -1444,7 +1442,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, return err; if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { - err = mdbx_ftruncate(map->fd, size); + err = osal_ftruncate(map->fd, size); if (err != MDBX_SUCCESS) return err; map->filesize = size; @@ -1452,7 +1450,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, 
mdbx_mmap_t *map, map->current = size; #endif /* !Windows */ } else { - err = mdbx_filesize(map->fd, &map->filesize); + err = osal_filesize(map->fd, &map->filesize); if (err != MDBX_SUCCESS) return err; #if !(defined(_WIN32) || defined(_WIN64)) @@ -1559,7 +1557,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. @@ -1586,7 +1584,7 @@ MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit) { assert(size <= limit); #if defined(_WIN32) || defined(_WIN64) @@ -1683,12 +1681,12 @@ retry_file_and_section: map->address = NULL; } - err = mdbx_filesize(map->fd, &map->filesize); + err = osal_filesize(map->fd, &map->filesize); if (err != MDBX_SUCCESS) goto bailout; if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { - err = mdbx_ftruncate(map->fd, size); + err = osal_ftruncate(map->fd, size); if (err == MDBX_SUCCESS) map->filesize = size; /* ignore error, because Windows unable shrink file @@ -1764,7 +1762,7 @@ retry_mapview:; #else /* Windows */ map->filesize = 0; - int rc = mdbx_filesize(map->fd, &map->filesize); + int rc = osal_filesize(map->fd, &map->filesize); if (rc != MDBX_SUCCESS) return rc; @@ -1774,7 +1772,7 @@ retry_mapview:; rc = (size > map->current) ? 
MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; } else { if (map->filesize != size) { - rc = mdbx_ftruncate(map->fd, size); + rc = osal_ftruncate(map->fd, size); if (rc != MDBX_SUCCESS) return rc; map->filesize = size; @@ -1956,7 +1954,7 @@ retry_mapview:; /*----------------------------------------------------------------------------*/ -__cold MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny) { +__cold MDBX_INTERNAL_FUNC void osal_jitter(bool tiny) { for (;;) { #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__) @@ -2011,8 +2009,7 @@ static LARGE_INTEGER performance_frequency; static uint64_t ratio_16dot16_to_monotine; #endif -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { #if defined(_WIN32) || defined(_WIN64) if (unlikely(performance_frequency.QuadPart == 0)) QueryPerformanceFrequency(&performance_frequency); @@ -2031,12 +2028,12 @@ mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { return likely(ret || seconds_16dot16 == 0) ? 
ret : /* fix underflow */ 1; } -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { static uint64_t limit; if (unlikely(monotime > limit)) { if (likely(limit != 0)) return UINT32_MAX; - limit = mdbx_osal_16dot16_to_monotime(UINT32_MAX - 1); + limit = osal_16dot16_to_monotime(UINT32_MAX - 1); if (unlikely(monotime > limit)) return UINT32_MAX; } @@ -2053,7 +2050,7 @@ MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { return monotime > 0 /* fix underflow */; } -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) { +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER counter; counter.QuadPart = 0; @@ -2222,7 +2219,7 @@ bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) { return false; } -__cold MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void) { +__cold MDBX_INTERNAL_FUNC bin128_t osal_bootid(void) { bin128_t bin = {{0, 0}}; bool got_machineid = false, got_boottime = false, got_bootseq = false; @@ -2535,7 +2532,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, if (avail_pages) *avail_pages = -1; - const intptr_t pagesize = mdbx_syspagesize(); + const intptr_t pagesize = osal_syspagesize(); if (page_size) *page_size = pagesize; if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize))) diff --git a/src/osal.h b/src/osal.h index f30c0db6..cec91dca 100644 --- a/src/osal.h +++ b/src/osal.h @@ -58,7 +58,7 @@ #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -78,7 +78,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED 
static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -116,8 +116,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -125,8 +125,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) /* *INDENT-OFF* */ @@ -139,36 +139,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? 
HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -180,26 +180,26 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif -MDBX_INTERNAL_FUNC size_t mdbx_mb2w(wchar_t *dst, size_t dst_n, const char *src, +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, size_t src_n); #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -217,7 +217,7 @@ typedef pthread_mutex_t 
mdbx_fastmutex_t; * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. */ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { +osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -233,7 +233,7 @@ typedef wchar_t pathchar_t; typedef char pathchar_t; #endif -typedef struct mdbx_mmap_param { +typedef struct osal_mmap_param { union { void *address; uint8_t *dxb; @@ -246,7 +246,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -254,13 +254,13 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ #ifndef __cplusplus @@ -270,12 +270,12 @@ typedef union MDBX_srwlock { #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -286,8 +286,8 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE 
-> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ #if defined(_WIN32) || defined(_WIN64) @@ -297,15 +297,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -314,57 +314,57 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t 
*condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void 
*arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { +enum osal_openfile_purpose { MDBX_OPEN_DXB_READ = 0, MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, @@ -373,26 +373,26 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DELETE = 5 }; -MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const pathchar_t *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const pathchar_t *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t 
fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, const size_t must, const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -400,18 +400,18 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -421,7 +421,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t 
mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -434,24 +434,23 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -467,7 +466,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. 
-MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -488,7 +487,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -496,14 +495,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reson for such downgrade: @@ -516,14 +515,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. 
/// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -538,15 +537,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -556,28 +555,28 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). 
/// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -#define MUSTDIE_MB2WIDE(FROM, TO) \ +#define OSAL_MB2WIDE(FROM, TO) \ do { \ const char *const from_tmp = (FROM); \ const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = mdbx_mb2w(nullptr, 0, from_tmp, from_mblen); \ + const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ return ERROR_INVALID_NAME; \ wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ if (to_wlen + 1 != \ - mdbx_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ return ERROR_INVALID_NAME; \ (TO) = to_tmp; \ } while (0) -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { diff --git a/test/base.h b/test/base.h index 4c113f72..fa22ce60 100644 --- a/test/base.h +++ b/test/base.h @@ -94,7 +94,7 @@ #define MDBX_INTERNAL_FUNC #define MDBX_INTERNAL_VAR extern -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "../mdbx.h++" #include "../src/base.h" #include "../src/osal.h" diff --git a/test/copy.cc b/test/copy.cc index 37c58a24..d164fc45 100644 --- a/test/copy.cc +++ b/test/copy.cc @@ -15,7 +15,7 @@ 
REGISTER_TESTCASE(copy); void testcase_copy::copy_db(const bool with_compaction) { int err = mdbx_env_delete(copy_pathname.c_str(), MDBX_ENV_JUST_DELETE); if (err != MDBX_SUCCESS && err != MDBX_RESULT_TRUE) - failure_perror("mdbx_removefile()", err); + failure_perror("osal_removefile()", err); err = mdbx_env_copy(db_guard.get(), copy_pathname.c_str(), with_compaction ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS); diff --git a/test/valgrind_suppress.txt b/test/valgrind_suppress.txt index 1977becc..2eb3142b 100644 --- a/test/valgrind_suppress.txt +++ b/test/valgrind_suppress.txt @@ -4,7 +4,7 @@ msync(start) fun:msync ... - fun:mdbx_sync_locked* + fun:sync_locked* } { msync-whole-mmap-2 @@ -12,7 +12,7 @@ msync(start) fun:msync ... - fun:mdbx_env_sync_internal* + fun:env_sync* } { msync-whole-mmap-3 @@ -20,7 +20,7 @@ msync(start) fun:msync ... - fun:mdbx_mapresize* + fun:map_resize* } { msync-wipe-steady @@ -28,21 +28,21 @@ msync(start) fun:msync ... - fun:mdbx_wipe_steady* + fun:wipe_steady* } -# memcmp() inside mdbx_iov_write() as workaround for todo4recovery://erased_by_github/libmdbx/issues/269 +# memcmp() inside iov_write() as workaround for todo4recovery://erased_by_github/libmdbx/issues/269 { write-page-check-bcmp Memcheck:Cond fun:bcmp - fun:mdbx_iov_write* + fun:iov_write* } { write-page-check-memcmp Memcheck:Cond fun:memcmp* - fun:mdbx_iov_write* + fun:iov_write* } # single-page flush by pwrite() @@ -52,7 +52,7 @@ pwrite(buf) fun:pwrite ... - fun:mdbx_iov_write* + fun:iov_write* } { pwrite64-page-flush @@ -60,7 +60,7 @@ pwrite64(buf) fun:pwrite ... - fun:mdbx_iov_write* + fun:iov_write* } # modern Valgrind don't support the `vector[...]` pattern @@ -70,16 +70,16 @@ # pwritev(vector[...]) # fun:pwritev # ... 
-# fun:mdbx_iov_write* +# fun:iov_write* #} -# for((i=0;i<64;++i)); do echo -e "{\n pwritev-page-flush-$i\n Memcheck:Param\n pwritev(vector[$i])\n fun:pwritev\n ...\n fun:mdbx_iov_write*\n}"; done >> valgrind_suppress.txt +# for((i=0;i<64;++i)); do echo -e "{\n pwritev-page-flush-$i\n Memcheck:Param\n pwritev(vector[$i])\n fun:pwritev\n ...\n fun:iov_write*\n}"; done >> valgrind_suppress.txt { pwritev-page-flush-0 Memcheck:Param pwritev(vector[0]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-1 @@ -87,7 +87,7 @@ pwritev(vector[1]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-2 @@ -95,7 +95,7 @@ pwritev(vector[2]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-3 @@ -103,7 +103,7 @@ pwritev(vector[3]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-4 @@ -111,7 +111,7 @@ pwritev(vector[4]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-5 @@ -119,7 +119,7 @@ pwritev(vector[5]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-6 @@ -127,7 +127,7 @@ pwritev(vector[6]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-7 @@ -135,7 +135,7 @@ pwritev(vector[7]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-8 @@ -143,7 +143,7 @@ pwritev(vector[8]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-9 @@ -151,7 +151,7 @@ pwritev(vector[9]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-10 @@ -159,7 +159,7 @@ pwritev(vector[10]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-11 @@ -167,7 +167,7 @@ pwritev(vector[11]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-12 @@ -175,7 +175,7 @@ pwritev(vector[12]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-13 @@ -183,7 +183,7 @@ pwritev(vector[13]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-14 @@ -191,7 +191,7 @@ pwritev(vector[14]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-15 @@ -199,7 +199,7 @@ pwritev(vector[15]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-16 @@ -207,7 +207,7 @@ pwritev(vector[16]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-17 @@ -215,7 +215,7 @@ pwritev(vector[17]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-18 @@ -223,7 +223,7 @@ pwritev(vector[18]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-19 @@ -231,7 +231,7 @@ pwritev(vector[19]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-20 @@ -239,7 +239,7 @@ pwritev(vector[20]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-21 @@ -247,7 +247,7 @@ pwritev(vector[21]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-22 @@ -255,7 +255,7 @@ pwritev(vector[22]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-23 @@ -263,7 +263,7 @@ pwritev(vector[23]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-24 @@ -271,7 +271,7 @@ pwritev(vector[24]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-25 @@ -279,7 +279,7 @@ pwritev(vector[25]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-26 @@ -287,7 +287,7 @@ pwritev(vector[26]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-27 @@ -295,7 +295,7 @@ pwritev(vector[27]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-28 @@ -303,7 +303,7 @@ pwritev(vector[28]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-29 @@ -311,7 +311,7 @@ pwritev(vector[29]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-30 @@ -319,7 +319,7 @@ pwritev(vector[30]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-31 @@ -327,7 +327,7 @@ pwritev(vector[31]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-32 @@ -335,7 +335,7 @@ pwritev(vector[32]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-33 @@ -343,7 +343,7 @@ pwritev(vector[33]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-34 @@ -351,7 +351,7 @@ pwritev(vector[34]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-35 @@ -359,7 +359,7 @@ pwritev(vector[35]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-36 @@ -367,7 +367,7 @@ pwritev(vector[36]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-37 @@ -375,7 +375,7 @@ pwritev(vector[37]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-38 @@ -383,7 +383,7 @@ pwritev(vector[38]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-39 @@ -391,7 +391,7 @@ pwritev(vector[39]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-40 @@ -399,7 +399,7 @@ pwritev(vector[40]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-41 @@ -407,7 +407,7 @@ pwritev(vector[41]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-42 @@ -415,7 +415,7 @@ pwritev(vector[42]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-43 @@ -423,7 +423,7 @@ pwritev(vector[43]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-44 @@ -431,7 +431,7 @@ pwritev(vector[44]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-45 @@ -439,7 +439,7 @@ pwritev(vector[45]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-46 @@ -447,7 +447,7 @@ pwritev(vector[46]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-47 @@ -455,7 +455,7 @@ pwritev(vector[47]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-48 @@ -463,7 +463,7 @@ pwritev(vector[48]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-49 @@ -471,7 +471,7 @@ pwritev(vector[49]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-50 @@ -479,7 +479,7 @@ pwritev(vector[50]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-51 @@ -487,7 +487,7 @@ pwritev(vector[51]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-52 @@ -495,7 +495,7 @@ pwritev(vector[52]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-53 @@ -503,7 +503,7 @@ pwritev(vector[53]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-54 @@ -511,7 +511,7 @@ pwritev(vector[54]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-55 @@ -519,7 +519,7 @@ pwritev(vector[55]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-56 @@ -527,7 +527,7 @@ pwritev(vector[56]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-57 @@ -535,7 +535,7 @@ pwritev(vector[57]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-58 @@ -543,7 +543,7 @@ pwritev(vector[58]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-59 @@ -551,7 +551,7 @@ pwritev(vector[59]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-60 @@ -559,7 +559,7 @@ pwritev(vector[60]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-61 @@ -567,7 +567,7 @@ pwritev(vector[61]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-62 @@ -575,7 +575,7 @@ pwritev(vector[62]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-63 @@ -583,5 +583,5 @@ pwritev(vector[63]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } From ae730ae2f35a58a94324a85dec172e5d1aac0a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 11 Aug 2022 01:08:47 +0300 Subject: [PATCH 087/364] mdbx: fix minor warnings for ASAN-enabled builds. --- src/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/core.c b/src/core.c index cb08e340..0093005e 100644 --- a/src/core.c +++ b/src/core.c @@ -5999,7 +5999,9 @@ scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { do { mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); if (mask) { +#ifndef __SANITIZE_ADDRESS__ found: +#endif /* __SANITIZE_ADDRESS__ */ return range + 28 - __builtin_clz(mask); } range -= 4; @@ -6058,7 +6060,9 @@ scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { do { mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); if (mask) { +#ifndef __SANITIZE_ADDRESS__ found: +#endif /* __SANITIZE_ADDRESS__ */ return range + 24 - __builtin_clz(mask); } range -= 8; @@ -6123,7 +6127,9 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { do { mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); if (mask) { +#ifndef __SANITIZE_ADDRESS__ found: +#endif /* __SANITIZE_ADDRESS__ */ return range + 16 - __builtin_clz(mask); } range -= 16; From 34a4e7e1025204eb7a796263d549366f84e9f667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 11 Aug 2022 12:39:30 
+0300 Subject: [PATCH 088/364] mdbx: avoid Valgrind warnings. --- src/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index 0093005e..3ae6a62b 100644 --- a/src/core.c +++ b/src/core.c @@ -6016,7 +6016,8 @@ scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ #ifndef __SANITIZE_ADDRESS__ const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; - if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { const unsigned extra = (unsigned)(detent + 4 - range); assert(extra > 0 && extra < 4); mask = 0xF << extra; @@ -6077,7 +6078,8 @@ scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ #ifndef __SANITIZE_ADDRESS__ const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; - if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { const unsigned extra = (unsigned)(detent + 8 - range); assert(extra > 0 && extra < 8); mask = 0xFF << extra; @@ -6144,7 +6146,8 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ #ifndef __SANITIZE_ADDRESS__ const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; - if (likely(on_page_safe_mask & (uintptr_t)(range + offset))) { + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { const unsigned extra = (unsigned)(detent + 16 - range); assert(extra > 0 && extra < 16); mask = 0xFFFF << extra; From 1c5ef060c5495c02f3a8c02ccb4125c78150c535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 11 Aug 2022 17:09:13 +0300 Subject: [PATCH 089/364] mdbx: reduce number of memory fences in the hot paths. --- src/core.c | 128 +++++++++++++++++++++++++++++++++--------------- src/internals.h | 16 +++--- 2 files changed, 97 insertions(+), 47 deletions(-) diff --git a/src/core.c b/src/core.c index 3ae6a62b..eb0ec5d5 100644 --- a/src/core.c +++ b/src/core.c @@ -930,7 +930,7 @@ MDBX_MAYBE_UNUSED static __always_inline #endif /* MDBX_64BIT_ATOMIC */ uint64_t - atomic_load64(const MDBX_atomic_uint64_t *p, + atomic_load64(const volatile MDBX_atomic_uint64_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); #if MDBX_64BIT_ATOMIC @@ -5068,9 +5068,35 @@ static __inline void meta_cache_clear(MDBX_env *env) { static __inline txnid_t meta_txnid(const MDBX_env *env, volatile const MDBX_meta *meta) { (void)env; - txnid_t a = unaligned_peek_u64_volatile(4, &meta->mm_txnid_a); - txnid_t b = unaligned_peek_u64_volatile(4, &meta->mm_txnid_b); - return (a == b) ? 
a : 0; +#if defined(__amd64__) && !defined(ENABLE_UBSAN) && MDBX_UNALIGNED_OK >= 8 + const uint64_t id = + atomic_load64((const volatile MDBX_atomic_uint64_t *)&meta->mm_txnid_a, + mo_AcquireRelease); + if (unlikely(id != + atomic_load64( + (const volatile MDBX_atomic_uint64_t *)&meta->mm_txnid_b, + mo_AcquireRelease))) + return 0; + return id; +#else + const uint32_t l = atomic_load32( + &meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + mo_AcquireRelease); + if (unlikely(l != + atomic_load32( + &meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + mo_AcquireRelease))) + return 0; + const uint32_t h = atomic_load32( + &meta->mm_txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + mo_AcquireRelease); + if (unlikely(h != + atomic_load32( + &meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + mo_AcquireRelease))) + return 0; + return l | (uint64_t)h << 32; +#endif } static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, @@ -5079,9 +5105,21 @@ static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; - unaligned_poke_u64(4, meta->mm_txnid_b, 0); - osal_memory_fence(mo_AcquireRelease, true); - unaligned_poke_u64(4, meta->mm_txnid_a, txnid); +#if defined(__amd64__) && !defined(ENABLE_UBSAN) && MDBX_UNALIGNED_OK >= 8 + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, 0, + mo_AcquireRelease); + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_a, txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + 0, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + 0, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ == 
__ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); +#endif } static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, @@ -5092,8 +5130,15 @@ static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, (void)env; jitter4testing(true); memcpy(&meta->mm_bootid, &bootid, 16); - unaligned_poke_u64(4, meta->mm_txnid_b, txnid); - osal_memory_fence(mo_AcquireRelease, true); +#if defined(__amd64__) && !defined(ENABLE_UBSAN) && MDBX_UNALIGNED_OK >= 8 + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); +#endif } static __inline void meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, @@ -5243,7 +5288,6 @@ __cold static txnid_t recent_committed_txnid(const MDBX_env *env) { while (true) { volatile const MDBX_meta *head = meta_prefer_last(env); const txnid_t recent = meta_txnid(env, head); - osal_memory_fence(mo_AcquireRelease, false); if (likely(head == meta_prefer_last(env) && recent == meta_txnid(env, head))) return recent; @@ -7725,7 +7769,6 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. 
*/ - osal_memory_fence(mo_AcquireRelease, false); const txnid_t oldest = atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease); if (unlikely(target_txnid < oldest || @@ -8239,7 +8282,6 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); info->txn_space_leftover = pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); - osal_memory_fence(mo_AcquireRelease, false); } while (unlikely(head_meta != meta_prefer_last(env) || head_txnid != meta_txnid(env, head_meta))); @@ -8352,11 +8394,19 @@ int mdbx_txn_flags(const MDBX_txn *txn) { } /* Check for misused dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) +static __inline bool dbi_changed(MDBX_txn *txn, MDBX_dbi dbi) { + if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs) + return false; + if (likely( + txn->mt_dbiseqs[dbi].weak == + atomic_load32((MDBX_atomic_uint32_t *)&txn->mt_env->me_dbiseqs[dbi], + mo_AcquireRelease))) + return false; + return true; +} static __inline unsigned dbi_seq(const MDBX_env *const env, unsigned slot) { - unsigned v = env->me_dbiseqs[slot] + 1; + unsigned v = env->me_dbiseqs[slot].weak + 1; return v + (v == 0); } @@ -8367,10 +8417,10 @@ static void dbi_import_locked(MDBX_txn *txn) { if (i >= txn->mt_numdbs) { txn->mt_cursors[i] = NULL; if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[i] = 0; + txn->mt_dbiseqs[i].weak = 0; txn->mt_dbistate[i] = 0; } - if ((TXN_DBI_CHANGED(txn, i) && + if ((dbi_changed(txn, i) && (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) || ((env->me_dbflags[i] & DB_VALID) && !(txn->mt_dbistate[i] & DBI_VALID))) { @@ -8393,7 +8443,7 @@ static void dbi_import_locked(MDBX_txn *txn) { else { if ((txn->mt_dbistate[n] & DBI_USRVALID) == 0) { if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[n] = 0; + txn->mt_dbiseqs[n].weak = 0; txn->mt_dbistate[n] = 0; } ++n; @@ 
-8430,7 +8480,8 @@ static void dbi_update(MDBX_txn *txn, int keep) { ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } - if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i]) + if (env->me_numdbs <= i || + txn->mt_dbiseqs[i].weak != env->me_dbiseqs[i].weak) continue /* dbi explicitly closed and/or then re-opened by other txn */; if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; @@ -8438,9 +8489,9 @@ static void dbi_update(MDBX_txn *txn, int keep) { char *ptr = env->me_dbxs[i].md_name.iov_base; if (ptr) { env->me_dbxs[i].md_name.iov_len = 0; - osal_memory_fence(mo_AcquireRelease, true); eASSERT(env, env->me_dbflags[i] == 0); - env->me_dbiseqs[i] = dbi_seq(env, i); + atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), + mo_AcquireRelease); env->me_dbxs[i].md_name.iov_base = NULL; osal_free(ptr); } @@ -9835,8 +9886,7 @@ static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi, unsigned validity) { if (likely(dbi < txn->mt_numdbs)) { - osal_memory_fence(mo_AcquireRelease, false); - if (likely(!TXN_DBI_CHANGED(txn, dbi))) { + if (likely(!dbi_changed(txn, dbi))) { if (likely(txn->mt_dbistate[dbi] & validity)) return true; if (likely(dbi < CORE_DBS || @@ -11157,19 +11207,19 @@ static int sync_locked(MDBX_env *env, unsigned flags, (meta0 == head) ? "head" : (meta0 == target) ? "tail" : "stay", - durable_caption(meta0), meta_txnid(env, meta0), + durable_caption(meta0), constmeta_txnid(env, meta0), meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, (meta1 == head) ? "head" : (meta1 == target) ? 
"tail" : "stay", - durable_caption(meta1), meta_txnid(env, meta1), + durable_caption(meta1), constmeta_txnid(env, meta1), meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, (meta2 == head) ? "head" : (meta2 == target) ? "tail" : "stay", - durable_caption(meta2), meta_txnid(env, meta2), + durable_caption(meta2), constmeta_txnid(env, meta2), meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); eASSERT(env, !meta_eq(env, pending, meta0)); @@ -12129,7 +12179,6 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, } } else /* not recovery mode */ while (1) { - osal_memory_fence(mo_AcquireRelease, false); const unsigned meta_clash_mask = meta_eq_mask(env); if (unlikely(meta_clash_mask)) { ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); @@ -13237,7 +13286,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, const size_t tsize = sizeof(MDBX_txn), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + - sizeof(unsigned) + 1); + sizeof(MDBX_atomic_uint32_t) + 1); rc = alloc_page_buf(env); if (rc == MDBX_SUCCESS) { memset(env->me_pbuf, -1, env->me_psize * 2); @@ -13245,7 +13294,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (txn) { txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbiseqs = + (MDBX_atomic_uint32_t *)(txn->mt_cursors + env->me_maxdbs); txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; @@ -13269,7 +13319,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), env->me_psize); DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - data_page(meta)->mp_pgno, meta_txnid(env, meta)); + 
data_page(meta)->mp_pgno, constmeta_txnid(env, meta)); DEBUG("depth: %u", db->md_depth); DEBUG("entries: %" PRIu64, db->md_entries); DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); @@ -13920,7 +13970,7 @@ static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { MDBX_cursor_couple couple; - if (unlikely(TXN_DBI_CHANGED(txn, dbi))) { + if (unlikely(dbi_changed(txn, dbi))) { NOTICE("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); return MDBX_BAD_DBI; } @@ -15267,7 +15317,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) return MDBX_BAD_DBI; cASSERT(mc, cursor_is_tracked(mc)); @@ -16058,7 +16108,7 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) return MDBX_BAD_DBI; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) @@ -20279,7 +20329,6 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; - osal_memory_fence(mo_AcquireRelease, false); volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); arg->mi_recent_txnid = meta_txnid(env, recent_meta); arg->mi_meta0_txnid = meta_txnid(env, meta0); @@ -20694,16 +20743,19 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbistate[slot] = (uint8_t)dbiflags; txn->mt_dbxs[slot].md_name.iov_base = namedup; txn->mt_dbxs[slot].md_name.iov_len = len; - txn->mt_dbiseqs[slot] = env->me_dbiseqs[slot] = dbi_seq(env, slot); + txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = + dbi_seq(env, slot); if (!(dbiflags & DBI_CREAT)) env->me_dbflags[slot] = 
txn->mt_dbs[slot].md_flags | DB_VALID; if (txn->mt_numdbs == slot) { + txn->mt_cursors[slot] = NULL; osal_compiler_barrier(); txn->mt_numdbs = slot + 1; - txn->mt_cursors[slot] = NULL; } - if (env->me_numdbs <= slot) + if (env->me_numdbs <= slot) { + osal_memory_fence(mo_AcquireRelease, true); env->me_numdbs = slot + 1; + } *dbi = slot; } @@ -21044,7 +21096,6 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, const uint64_t head_pages_retired = unaligned_peek_u64_volatile(4, recent_meta->mm_pages_retired); const txnid_t head_txnid = meta_txnid(env, recent_meta); - osal_memory_fence(mo_AcquireRelease, false); if (unlikely(recent_meta != meta_prefer_last(env) || head_pages_retired != unaligned_peek_u64_volatile( @@ -21381,7 +21432,6 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) const pgno_t maxpg = meta->mm_geo.now; *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } - osal_memory_fence(mo_AcquireRelease, false); } while (unlikely(recent != meta_txnid(env, meta))); txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP; diff --git a/src/internals.h b/src/internals.h index 119bce90..ab74f25a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -337,8 +337,8 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); @@ -455,7 +455,7 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + MDBX_atomic_uint32_t mm_txnid_a[2]; uint16_t 
mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -478,7 +478,7 @@ typedef struct MDBX_meta { uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. @@ -959,7 +959,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -1147,9 +1147,9 @@ struct MDBX_env { void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ uint32_t me_live_reader; /* have liveness lock in reader table */ From 345c3d433f5b3a03d7fc336077ababd2b12e6e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 11 Aug 2022 19:39:14 +0300 Subject: [PATCH 090/364] mdbx-make: add `-DENABLE_UBSAN` to ubsan-targets. 
--- GNUmakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GNUmakefile b/GNUmakefile index ae126325..997795a6 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -452,7 +452,7 @@ gcc-analyzer: test-ubsan: @echo ' RE-TEST with `-fsanitize=undefined` option...' - $(QUIET)$(MAKE) IOARENA=false CXXSTD=$(CXXSTD) CFLAGS_EXTRA="-Ofast -fsanitize=undefined -fsanitize-undefined-trap-on-error" test + $(QUIET)$(MAKE) IOARENA=false CXXSTD=$(CXXSTD) CFLAGS_EXTRA="-DENABLE_UBSAN -Ofast -fsanitize=undefined -fsanitize-undefined-trap-on-error" test test-asan: @echo ' RE-TEST with `-fsanitize=address` option...' From 9b3faee630004c7b40bd5c83c72ad7acd7c2a4d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 17 Aug 2022 21:19:14 +0300 Subject: [PATCH 091/364] mdbx: drop obsolete Nexenta attributes API. --- mdbx.h | 171 ----------------------------------------------------- src/core.c | 126 --------------------------------------- 2 files changed, 297 deletions(-) diff --git a/mdbx.h b/mdbx.h index 82d46ef4..a99d4143 100644 --- a/mdbx.h +++ b/mdbx.h @@ -5240,177 +5240,6 @@ LIBMDBX_API int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta); /** end of btree_traversal @} */ -/**** Attribute support functions for Nexenta (scheduled for removal) - * *****************************************************************/ -#if defined(MDBX_NEXENTA_ATTRS) || defined(DOXYGEN) -/** \defgroup nexenta Attribute support functions for Nexenta - * \ingroup c_crud - * @{ */ -typedef uint_fast64_t mdbx_attr_t; - -/** Store by cursor with attribute. - * - * This function stores key/data pairs into the database. The cursor is - * positioned at the new item, or on failure usually near it. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. 
- * - * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open() - * \param [in] key The key operated on. - * \param [in] data The data operated on. - * \param [in] attr The attribute. - * \param [in] flags Options for this operation. This parameter must be set - * to 0 or one of the values described here: - * - \ref MDBX_CURRENT - * Replace the item at the current cursor position. The key parameter - * must still be provided, and must match it, otherwise the function - * return \ref MDBX_EKEYMISMATCH. - * - * - \ref MDBX_APPEND - * Append the given key/data pair to the end of the database. No key - * comparisons are performed. This option allows fast bulk loading when - * keys are already known to be in the correct order. Loading unsorted - * keys with this flag will cause a \ref MDBX_KEYEXIST error. - * - * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_EKEYMISMATCH - * \retval MDBX_MAP_FULL The database is full, see \ref mdbx_env_set_mapsize(). - * \retval MDBX_TXN_FULL The transaction has too many dirty pages. - * \retval MDBX_EACCES An attempt was made to write in a read-only - * transaction. - * \retval MDBX_EINVAL an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr, - MDBX_put_flags_t flags); - -/** Store items and attributes into a database. - * - * This function stores key/data pairs in the database. The default behavior - * is to enter the new key/data pair, replacing any previously existing key - * if duplicates are disallowed. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). 
- * \param [in] key The key to store in the database. - * \param [in] attr The attribute to store in the database. - * \param [in,out] data The data to store. - * \param [in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or - * more of the values described here: - * - \ref MDBX_NOOVERWRITE - * Enter the new key/data pair only if the key does not already appear - * in the database. The function will return \ref MDBX_KEYEXIST if the key - * already appears in the database. The data parameter will be set to - * point to the existing item. - * - * - \ref MDBX_CURRENT - * Update an single existing entry, but not add new ones. The function - * will return \ref MDBX_NOTFOUND if the given key not exist in the - * database. Or the \ref MDBX_EMULTIVAL in case duplicates for the given - * key. - * - * - \ref MDBX_APPEND - * Append the given key/data pair to the end of the database. This option - * allows fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * a \ref MDBX_EKEYMISMATCH error. - * - * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_KEYEXIST - * \retval MDBX_MAP_FULL The database is full, see \ref mdbx_env_set_mapsize(). - * \retval MDBX_TXN_FULL The transaction has too many dirty pages. - * \retval MDBX_EACCES An attempt was made to write - * in a read-only transaction. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr, - MDBX_put_flags_t flags); - -/** Set items attribute from a database. - * - * This function stores key/data pairs attribute to the database. 
- * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to search for in the database. - * \param [in] data The data to be stored or NULL to save previous value. - * \param [in] attr The attribute to be stored. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND The key-value pair was not in the database. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr); - -/** Get items attribute from a database cursor. - * - * This function retrieves key/data pairs from the database. The address and - * length of the key are returned in the object to which key refers (except - * for the case of the \ref MDBX_SET option, in which the key object is - * unchanged), and the address and length of the data are returned in the object - * to which data refers. - * \see mdbx_get() - * - * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). - * \param [in,out] key The key for a retrieved item. - * \param [in,out] data The data of a retrieved item. - * \param [out] pattr The pointer to retrieve attribute. - * \param [in] op A cursor operation MDBX_cursor_op. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND No matching key found. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_get_attr(MDBX_cursor *cursor, MDBX_val *key, - MDBX_val *data, mdbx_attr_t *pattr, - MDBX_cursor_op op); - -/** Get items attribute from a database. - * - * This function retrieves key/data pairs from the database. 
The address - * and length of the data associated with the specified key are returned - * in the structure to which data refers. - * If the database supports duplicate keys (see \ref MDBX_DUPSORT) then the - * first data item for the key will be returned. Retrieval of other - * items requires the use of \ref mdbx_cursor_get(). - * - * \note The memory pointed to by the returned values is owned by the - * database. The caller need not dispose of the memory, and may not - * modify it in any way. For values returned in a read-only transaction - * any modification attempts will cause a `SIGSEGV`. - * - * \note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to search for in the database. - * \param [in,out] data The data corresponding to the key. - * \param [out] pattr The pointer to retrieve attribute. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND The key was not in the database. - * \retval MDBX_EINVAL An invalid parameter was specified. 
*/ -LIBMDBX_API int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t *pattr); -/** end of nexenta @} */ -#endif /* MDBX_NEXENTA_ATTRS */ - /** end of c_api @} */ /******************************************************************************* diff --git a/src/core.c b/src/core.c index eb0ec5d5..eaf9c818 100644 --- a/src/core.c +++ b/src/core.c @@ -23018,132 +23018,6 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, return MDBX_SUCCESS; } -/*** Attribute support functions for Nexenta **********************************/ -#ifdef MDBX_NEXENTA_ATTRS - -static __inline int mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) { - if (unlikely(data->iov_len < sizeof(mdbx_attr_t))) - return MDBX_INCOMPATIBLE; - - if (likely(attrptr != NULL)) - *attrptr = *(mdbx_attr_t *)data->iov_base; - data->iov_len -= sizeof(mdbx_attr_t); - data->iov_base = - likely(data->iov_len > 0) ? ((mdbx_attr_t *)data->iov_base) + 1 : NULL; - - return MDBX_SUCCESS; -} - -static __inline int mdbx_attr_poke(MDBX_val *reserved, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - mdbx_attr_t *space = reserved->iov_base; - if (flags & MDBX_RESERVE) { - if (likely(data != NULL)) { - data->iov_base = data->iov_len ? 
space + 1 : NULL; - } - } else { - *space = attr; - if (likely(data != NULL)) { - memcpy(space + 1, data->iov_base, data->iov_len); - } - } - - return MDBX_SUCCESS; -} - -int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr, MDBX_cursor_op op) { - int rc = mdbx_cursor_get(mc, key, data, op); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_peek(data, attrptr); -} - -int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - uint64_t *attrptr) { - int rc = mdbx_get(txn, dbi, key, data); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_peek(data, attrptr); -} - -int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - MDBX_val reserve; - reserve.iov_base = NULL; - reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); - - int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_poke(&reserve, data, attr, flags); -} - -int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - MDBX_val reserve; - reserve.iov_base = NULL; - reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); - - int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_poke(&reserve, data, attr, flags); -} - -int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr) { - if (unlikely(!key || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(!check_dbi(txn, dbi, DB_USRVALID))) - return MDBX_BAD_DBI; - - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; - - MDBX_cursor_couple cx; - MDBX_val old_data; - int rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND && data) { - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_put_attr(&cx.outer, key, data, attr, 0); - txn->mt_cursors[dbi] = cx.outer.mc_next; - } - return rc; - } - - mdbx_attr_t old_attr = 0; - rc = mdbx_attr_peek(&old_data, &old_attr); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (old_attr == attr && (!data || (data->iov_len == old_data.iov_len && - memcmp(data->iov_base, old_data.iov_base, - old_data.iov_len) == 0))) - return MDBX_SUCCESS; - - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_put_attr(&cx.outer, key, data ? data : &old_data, attr, - MDBX_CURRENT); - txn->mt_cursors[dbi] = cx.outer.mc_next; - return rc; -} -#endif /* MDBX_NEXENTA_ATTRS */ - /******************************************************************************/ /* *INDENT-OFF* */ /* clang-format off */ From f9ad835680cfcff6b8ea2afb153825426a4d51f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 17 Aug 2022 21:27:32 +0300 Subject: [PATCH 092/364] mdbx: drop E2K libc obsolete workarounds. 
--- mdbx.h | 51 ------------------------ src/core.c | 114 ----------------------------------------------------- 2 files changed, 165 deletions(-) diff --git a/mdbx.h b/mdbx.h index a99d4143..f536d41d 100644 --- a/mdbx.h +++ b/mdbx.h @@ -5242,57 +5242,6 @@ LIBMDBX_API int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta); /** end of c_api @} */ -/******************************************************************************* - * Workaround for mmaped-lookahead-cross-page-boundary bug - * in an obsolete versions of Elbrus's libc and kernels. */ -#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ - MDBX_E2K_MLHCPB_WORKAROUND -LIBMDBX_API int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n); -LIBMDBX_API int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2); -LIBMDBX_API int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n); -LIBMDBX_API size_t mdbx_e2k_strlen_bug_workaround(const char *s); -LIBMDBX_API size_t mdbx_e2k_strnlen_bug_workaround(const char *s, - size_t maxlen); -#ifdef __cplusplus -namespace std { -inline int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n) { - return ::mdbx_e2k_memcmp_bug_workaround(s1, s2, n); -} -inline int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { - return ::mdbx_e2k_strcmp_bug_workaround(s1, s2); -} -inline int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n) { - return ::mdbx_e2k_strncmp_bug_workaround(s1, s2, n); -} -inline size_t mdbx_e2k_strlen_bug_workaround(const char *s) { - return ::mdbx_e2k_strlen_bug_workaround(s); -} -inline size_t mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { - return ::mdbx_e2k_strnlen_bug_workaround(s, maxlen); -} -} // namespace std -#endif /* __cplusplus */ - -#include -#include -#undef memcmp -#define memcmp mdbx_e2k_memcmp_bug_workaround -#undef bcmp -#define bcmp mdbx_e2k_memcmp_bug_workaround -#undef strcmp 
-#define strcmp mdbx_e2k_strcmp_bug_workaround -#undef strncmp -#define strncmp mdbx_e2k_strncmp_bug_workaround -#undef strlen -#define strlen mdbx_e2k_strlen_bug_workaround -#undef strnlen -#define strnlen mdbx_e2k_strnlen_bug_workaround -#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/core.c b/src/core.c index eaf9c818..9e91e4a5 100644 --- a/src/core.c +++ b/src/core.c @@ -781,120 +781,6 @@ get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) { get_key(node, keyptr); } -/*------------------------------------------------------------------------------ - * Workaround for mmaped-lookahead-cross-page-boundary bug - * in an obsolete versions of Elbrus's libc and kernels. */ -#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ - MDBX_E2K_MLHCPB_WORKAROUND -__hot int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n) { - if (unlikely(n > 42 - /* LY: align followed access if reasonable possible */ - && (((uintptr_t)s1) & 7) != 0 && - (((uintptr_t)s1) & 7) == (((uintptr_t)s2) & 7))) { - if (((uintptr_t)s1) & 1) { - const int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (diff) - return diff; - s1 = (char *)s1 + 1; - s2 = (char *)s2 + 1; - n -= 1; - } - - if (((uintptr_t)s1) & 2) { - const uint16_t a = *(uint16_t *)s1; - const uint16_t b = *(uint16_t *)s2; - if (likely(a != b)) - return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; - s1 = (char *)s1 + 2; - s2 = (char *)s2 + 2; - n -= 2; - } - - if (((uintptr_t)s1) & 4) { - const uint32_t a = *(uint32_t *)s1; - const uint32_t b = *(uint32_t *)s2; - if (likely(a != b)) - return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1; - s1 = (char *)s1 + 4; - s2 = (char *)s2 + 4; - n -= 4; - } - } - - while (n >= 8) { - const uint64_t a = *(uint64_t *)s1; - const uint64_t b = *(uint64_t *)s2; - if (likely(a != b)) - return (__builtin_bswap64(a) > __builtin_bswap64(b)) ? 
1 : -1; - s1 = (char *)s1 + 8; - s2 = (char *)s2 + 8; - n -= 8; - } - - if (n & 4) { - const uint32_t a = *(uint32_t *)s1; - const uint32_t b = *(uint32_t *)s2; - if (likely(a != b)) - return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1; - s1 = (char *)s1 + 4; - s2 = (char *)s2 + 4; - } - - if (n & 2) { - const uint16_t a = *(uint16_t *)s1; - const uint16_t b = *(uint16_t *)s2; - if (likely(a != b)) - return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; - s1 = (char *)s1 + 2; - s2 = (char *)s2 + 2; - } - - return (n & 1) ? *(uint8_t *)s1 - *(uint8_t *)s2 : 0; -} - -__hot int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { - while (true) { - int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (likely(diff != 0) || *s1 == '\0') - return diff; - s1 += 1; - s2 += 1; - } -} - -__hot int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n) { - while (n > 0) { - int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (likely(diff != 0) || *s1 == '\0') - return diff; - s1 += 1; - s2 += 1; - n -= 1; - } - return 0; -} - -__hot size_t mdbx_e2k_strlen_bug_workaround(const char *s) { - size_t n = 0; - while (*s) { - s += 1; - n += 1; - } - return n; -} - -__hot size_t mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { - size_t n = 0; - while (maxlen > n && *s) { - s += 1; - n += 1; - } - return n; -} -#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ - /*------------------------------------------------------------------------------ * safe read/write volatile 64-bit fields on 32-bit architectures. */ From ef16dd2a22c761e7c3f4b8f8f7d175baf5da298d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 17 Aug 2022 21:31:11 +0300 Subject: [PATCH 093/364] mdbx: move `global_ctor()` to the end . 
--- src/core.c | 82 +++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/src/core.c b/src/core.c index 9e91e4a5..b7a96835 100644 --- a/src/core.c +++ b/src/core.c @@ -1311,47 +1311,6 @@ static void thread_rthc_set(osal_thread_key_t key, const void *value) { #endif } -__cold void global_ctor(void) { - rthc_limit = RTHC_INITIAL_LIMIT; - rthc_table = rthc_table_static; -#if defined(_WIN32) || defined(_WIN64) - InitializeCriticalSection(&rthc_critical_section); - InitializeCriticalSection(&lcklist_critical_section); -#else - ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); - TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), - __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); -#endif - /* checking time conversion, this also avoids racing on 32-bit architectures - * during storing calculated 64-bit ratio(s) into memory. */ - uint32_t proba = UINT32_MAX; - while (true) { - unsigned time_conversion_checkup = - osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba)); - unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; - unsigned one_less = (proba > 0) ? proba - 1 : proba; - ENSURE(nullptr, time_conversion_checkup >= one_less && - time_conversion_checkup <= one_more); - if (proba == 0) - break; - proba >>= 1; - } - - bootid = osal_bootid(); - -#if 0 /* debug */ - for (unsigned i = 0; i < 65536; ++i) { - size_t pages = pv2pages(i); - unsigned x = pages2pv(pages); - size_t xp = pv2pages(x); - if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) - printf("%u => %zu => %u => %zu\n", i, pages, x, xp); - assert(pages == xp); - } - fflush(stdout); -#endif /* #if 0 */ -} - /* dtor called for thread, i.e. 
for all mdbx's environment objects */ __cold void thread_dtor(void *rthc) { rthc_lock(); @@ -22904,6 +22863,47 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, return MDBX_SUCCESS; } +__cold void global_ctor(void) { + rthc_limit = RTHC_INITIAL_LIMIT; + rthc_table = rthc_table_static; +#if defined(_WIN32) || defined(_WIN64) + InitializeCriticalSection(&rthc_critical_section); + InitializeCriticalSection(&lcklist_critical_section); +#else + ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); + TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), + __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); +#endif + /* checking time conversion, this also avoids racing on 32-bit architectures + * during storing calculated 64-bit ratio(s) into memory. */ + uint32_t proba = UINT32_MAX; + while (true) { + unsigned time_conversion_checkup = + osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba)); + unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; + unsigned one_less = (proba > 0) ? 
proba - 1 : proba; + ENSURE(nullptr, time_conversion_checkup >= one_less && + time_conversion_checkup <= one_more); + if (proba == 0) + break; + proba >>= 1; + } + + bootid = osal_bootid(); + +#if 0 /* debug */ + for (unsigned i = 0; i < 65536; ++i) { + size_t pages = pv2pages(i); + unsigned x = pages2pv(pages); + size_t xp = pv2pages(x); + if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) + printf("%u => %zu => %u => %zu\n", i, pages, x, xp); + assert(pages == xp); + } + fflush(stdout); +#endif /* #if 0 */ +} + /******************************************************************************/ /* *INDENT-OFF* */ /* clang-format off */ From 7b36f946cbeabd7bd04927ed9d29ce40eb91f449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 17 Aug 2022 15:10:05 +0300 Subject: [PATCH 094/364] mdbx: rework/speed up accessing to meta-pages, choosing and cache of ones (squashed). --- src/core.c | 1300 ++++++++++++++++++++++++++--------------------- src/internals.h | 24 +- src/options.h | 6 - 3 files changed, 736 insertions(+), 594 deletions(-) diff --git a/src/core.c b/src/core.c index b7a96835..e92fb2dd 100644 --- a/src/core.c +++ b/src/core.c @@ -222,19 +222,19 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( static __always_inline uint64_t unaligned_peek_u64_volatile(const unsigned expected_alignment, - volatile const void *const __restrict ptr) { + const volatile void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); assert(expected_alignment % sizeof(uint32_t) == 0); if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) - return *(volatile const uint64_t *)ptr; + return *(const volatile uint64_t *)ptr; else { #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ defined(_M_X64) || defined(_M_IA64) - return *(volatile const __unaligned uint64_t *)ptr; + return 
*(const volatile __unaligned uint64_t *)ptr; #else - const uint32_t lo = ((volatile const uint32_t *) + const uint32_t lo = ((const volatile uint32_t *) ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; - const uint32_t hi = ((volatile const uint32_t *) + const uint32_t hi = ((const volatile uint32_t *) ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; return lo | (uint64_t)hi << 32; #endif /* _MSC_VER || __unaligned */ @@ -3105,8 +3105,8 @@ static int __must_check_result page_split(MDBX_cursor *mc, MDBX_val *const newdata, pgno_t newpgno, const unsigned naf); -static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, - bool report); +static bool coherency_check_meta(const MDBX_env *env, + const volatile MDBX_meta *meta, bool report); static int __must_check_result validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, MDBX_meta *dest); @@ -3117,7 +3117,8 @@ static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, const int lck_exclusive, const mdbx_mode_t mode_bits); static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending); + MDBX_meta *const pending, + meta_xyz_t *const xyz); static int env_close(MDBX_env *env); struct node_result { @@ -4892,65 +4893,56 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, #define METAPAGE(env, n) page_meta(pgno2page(env, n)) #define METAPAGE_END(env) METAPAGE(env, NUM_METAS) -MDBX_NOTHROW_PURE_FUNCTION static __inline txnid_t -constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) { - txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); - txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); - eASSERT(env, a == b); - (void)env; - return (a == b) ? a : 0; +MDBX_NOTHROW_PURE_FUNCTION static txnid_t +constmeta_txnid(const MDBX_meta *meta) { + const txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); + const txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); + return likely(a == b) ? 
a : 0; } -static __inline void meta_cache_clear(MDBX_env *env) { -#if MDBX_CACHE_METAPTR - env->cache_last_meta = nullptr; - env->cache_steady_meta = nullptr; -#else - (void)env; -#endif /* MDBX_CACHE_METAPTR */ -} +typedef struct { + uint64_t txnid; + size_t is_steady; +} meta_snap_t; -static __inline txnid_t meta_txnid(const MDBX_env *env, - volatile const MDBX_meta *meta) { - (void)env; -#if defined(__amd64__) && !defined(ENABLE_UBSAN) && MDBX_UNALIGNED_OK >= 8 - const uint64_t id = - atomic_load64((const volatile MDBX_atomic_uint64_t *)&meta->mm_txnid_a, - mo_AcquireRelease); - if (unlikely(id != - atomic_load64( - (const volatile MDBX_atomic_uint64_t *)&meta->mm_txnid_b, - mo_AcquireRelease))) - return 0; - return id; +static __always_inline txnid_t +atomic_load_txnid(const volatile MDBX_atomic_uint32_t *ptr) { +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + return atomic_load64((const volatile MDBX_atomic_uint64_t *)ptr, + mo_AcquireRelease); #else const uint32_t l = atomic_load32( - &meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], - mo_AcquireRelease); - if (unlikely(l != - atomic_load32( - &meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], - mo_AcquireRelease))) - return 0; + &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); const uint32_t h = atomic_load32( - &meta->mm_txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], - mo_AcquireRelease); - if (unlikely(h != - atomic_load32( - &meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], - mo_AcquireRelease))) - return 0; - return l | (uint64_t)h << 32; + &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); + return (uint64_t)h << 32 | l; #endif } +static __inline meta_snap_t meta_snap(const volatile MDBX_meta *meta) { + txnid_t txnid = atomic_load_txnid(meta->mm_txnid_a); + jitter4testing(true); + size_t is_steady = META_IS_STEADY(meta) && txnid >= MIN_TXNID; + jitter4testing(true); + if 
(unlikely(txnid != atomic_load_txnid(meta->mm_txnid_b))) + txnid = is_steady = 0; + meta_snap_t r = {txnid, is_steady}; + return r; +} + +static __inline txnid_t meta_txnid(const volatile MDBX_meta *meta) { + return meta_snap(meta).txnid; +} + static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; -#if defined(__amd64__) && !defined(ENABLE_UBSAN) && MDBX_UNALIGNED_OK >= 8 +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, 0, mo_AcquireRelease); atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_a, txnid, @@ -4975,7 +4967,8 @@ static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, (void)env; jitter4testing(true); memcpy(&meta->mm_bootid, &bootid, 16); -#if defined(__amd64__) && !defined(ENABLE_UBSAN) && MDBX_UNALIGNED_OK >= 8 +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, txnid, mo_AcquireRelease); #else @@ -5009,139 +5002,172 @@ static __inline uint64_t meta_sign(const MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? 
sign : ~sign; } -enum meta_choice_mode { prefer_last, prefer_steady }; +typedef struct { + txnid_t txnid; + union { + const volatile MDBX_meta *ptr_v; + const MDBX_meta *ptr_c; + }; + size_t is_steady; +} meta_ptr_t; -static __inline bool meta_ot(const enum meta_choice_mode mode, - const MDBX_env *env, volatile const MDBX_meta *a, - volatile const MDBX_meta *b) { - jitter4testing(true); - const txnid_t txnid_a = meta_txnid(env, a); - jitter4testing(true); - const txnid_t txnid_b = meta_txnid(env, b); - jitter4testing(true); - const bool is_stead_b = META_IS_STEADY(b); - - if (mode == prefer_steady) { - jitter4testing(true); - const bool is_stead_a = META_IS_STEADY(a); - if (is_stead_a != is_stead_b) - return is_stead_b; - } else { - eASSERT(env, mode == prefer_last); - } - if (txnid_a == txnid_b) - return is_stead_b; - return txnid_a < txnid_b; +static meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) { + eASSERT(env, n < NUM_METAS); + meta_ptr_t r; + meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n)); + r.txnid = snap.txnid; + r.is_steady = snap.is_steady; + return r; } -static bool meta_eq(const MDBX_env *env, volatile const MDBX_meta *a, - volatile const MDBX_meta *b) { - jitter4testing(true); - const txnid_t txnid = meta_txnid(env, a); - if (!txnid || txnid != meta_txnid(env, b)) - return false; - - jitter4testing(true); - if (META_IS_STEADY(a) != META_IS_STEADY(b)) - return false; - - jitter4testing(true); - return true; +static __always_inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) { + return unlikely(a == b) ? 1 * s : (a > b) ? 2 * s : 0 * s; } -static int meta_eq_mask(const MDBX_env *env) { - volatile const MDBX_meta *m0 = METAPAGE(env, 0); - volatile const MDBX_meta *m1 = METAPAGE(env, 1); - volatile const MDBX_meta *m2 = METAPAGE(env, 2); - - int rc = meta_eq(env, m0, m1) ? 
1 : 0; - if (meta_eq(env, m1, m2)) - rc += 2; - if (meta_eq(env, m2, m0)) - rc += 4; - return rc; +static __always_inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int, + bool a_steady, bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady); } -static __always_inline volatile const MDBX_meta * -meta_recent(const enum meta_choice_mode mode, const MDBX_env *env, - volatile const MDBX_meta *a, volatile const MDBX_meta *b) { - const bool a_older_that_b = meta_ot(mode, env, a, b); - eASSERT(env, !meta_eq(env, a, b)); - return a_older_that_b ? b : a; +static __always_inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int, + bool a_steady, bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1); } -static const MDBX_meta *meta_ancient_prefer_weak(const MDBX_env *env, - const MDBX_meta *a, - const MDBX_meta *b) { - const bool a_older_that_b = meta_ot(prefer_steady, env, a, b); - eASSERT(env, !meta_eq(env, a, b)); - return a_older_that_b ? 
a : b; +static __inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); } -static __always_inline volatile const MDBX_meta * -meta_mostrecent(const enum meta_choice_mode mode, const MDBX_env *env) { - volatile const MDBX_meta *m0 = METAPAGE(env, 0); - volatile const MDBX_meta *m1 = METAPAGE(env, 1); - volatile const MDBX_meta *m2 = METAPAGE(env, 2); - - volatile const MDBX_meta *head = meta_recent(mode, env, m0, m1); - head = meta_recent(mode, env, head, m2); - return head; +static __inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); } -static __noinline volatile const MDBX_meta * -meta_prefer_steady(const MDBX_env *env) { - return -#if MDBX_CACHE_METAPTR - ((MDBX_env *)env)->cache_steady_meta = -#endif /* MDBX_CACHE_METAPTR */ - meta_mostrecent(prefer_steady, env); +MDBX_MAYBE_UNUSED static __inline uint8_t meta_cmp2pack(uint8_t c01, + uint8_t c02, + uint8_t c12, bool s0, + bool s1, bool s2) { + assert(c01 < 3 && c02 < 3 && c12 < 3); + /* assert(s0 < 2 && s1 < 2 && s2 < 2); */ + const uint8_t recent = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1) + ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); + + uint8_t tail; + if (recent == 0) + tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent == 1) + tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail = meta_cmp2steady(c01, s0, s1) ? 
1 : 0; + + const bool valid = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7; } -MDBX_NOTHROW_PURE_FUNCTION static __inline const MDBX_meta * -constmeta_prefer_steady(const MDBX_env *env) { -#if MDBX_CACHE_METAPTR - if (likely(env->cache_steady_meta)) { - eASSERT(env, env->cache_steady_meta == meta_mostrecent(prefer_steady, env)); - return (const MDBX_meta *)env->cache_steady_meta; - } -#endif /* MDBX_CACHE_METAPTR */ - return (const MDBX_meta *)meta_prefer_steady(env); +static const uint8_t xyz_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { + 232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232, + 168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169, + 232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233, + 169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152, + 168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212, + 214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137, + 216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40, + 129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169, + 168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168, + 168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228, + 212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233, + 233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150, + 164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214, + 214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194, + 210, 194, 225, 193, 210, 194}; + +__hot static meta_xyz_t meta_tap(const MDBX_env *env) { + meta_snap_t snap; + meta_xyz_t r; + snap = meta_snap(METAPAGE(env, 0)); + r.txnid[0] = 
snap.txnid; + r.fsm = (uint8_t)snap.is_steady << 0; + snap = meta_snap(METAPAGE(env, 1)); + r.txnid[1] = snap.txnid; + r.fsm += (uint8_t)snap.is_steady << 1; + r.fsm += meta_cmp2int(r.txnid[0], r.txnid[1], 8); + snap = meta_snap(METAPAGE(env, 2)); + r.txnid[2] = snap.txnid; + r.fsm += (uint8_t)snap.is_steady << 2; + r.fsm += meta_cmp2int(r.txnid[0], r.txnid[2], 8 * 3); + r.fsm += meta_cmp2int(r.txnid[1], r.txnid[2], 8 * 3 * 3); + + const uint8_t xyz = xyz_fsm_map[r.fsm]; + r.recent = (xyz >> 2) & 3; + r.prefer_steady = (xyz >> 4) & 3; + r.tail_and_flags = xyz & 0xC3; + return r; } -__hot static __noinline volatile const MDBX_meta * -meta_prefer_last(const MDBX_env *env) { - return -#if MDBX_CACHE_METAPTR - ((MDBX_env *)env)->cache_last_meta = -#endif /* MDBX_CACHE_METAPTR */ - meta_mostrecent(prefer_last, env); +static txnid_t recent_committed_txnid(const MDBX_env *env) { + const txnid_t m0 = meta_txnid(METAPAGE(env, 0)); + const txnid_t m1 = meta_txnid(METAPAGE(env, 1)); + const txnid_t m2 = meta_txnid(METAPAGE(env, 2)); + return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? 
m1 : m2); } -MDBX_NOTHROW_PURE_FUNCTION static __inline const MDBX_meta * -constmeta_prefer_last(const MDBX_env *env) { -#if MDBX_CACHE_METAPTR - if (likely(env->cache_last_meta)) { - eASSERT(env, env->cache_last_meta == meta_mostrecent(prefer_last, env)); - return (const MDBX_meta *)env->cache_last_meta; - } -#endif /* MDBX_CACHE_METAPTR */ - return (const MDBX_meta *)meta_prefer_last(env); +static __inline bool meta_eq(const meta_xyz_t *z, unsigned y, unsigned x) { + assert(y < NUM_METAS && x < NUM_METAS); + return z->txnid[y] == z->txnid[x] && !(((z->fsm >> y) ^ (z->fsm >> x)) & 1); } -__cold static txnid_t recent_committed_txnid(const MDBX_env *env) { - while (true) { - volatile const MDBX_meta *head = meta_prefer_last(env); - const txnid_t recent = meta_txnid(env, head); - if (likely(head == meta_prefer_last(env) && - recent == meta_txnid(env, head))) - return recent; - } +static unsigned meta_eq_mask(const meta_xyz_t *xyz) { + return meta_eq(xyz, 0, 1) | meta_eq(xyz, 1, 2) << 1 | meta_eq(xyz, 2, 0) << 2; } -static const char *durable_caption(volatile const MDBX_meta *const meta) { +__hot static bool meta_should_retry(const MDBX_env *env, meta_xyz_t *xyz) { + const meta_xyz_t prev = *xyz; + *xyz = meta_tap(env); + return prev.fsm != xyz->fsm || prev.txnid[0] != xyz->txnid[0] || + prev.txnid[1] != xyz->txnid[1] || prev.txnid[2] != xyz->txnid[2]; +} + +static __always_inline meta_ptr_t meta_recent(const MDBX_env *env, + const meta_xyz_t *xyz) { + meta_ptr_t r; + r.txnid = xyz->txnid[xyz->recent]; + r.ptr_v = METAPAGE(env, xyz->recent); + r.is_steady = (xyz->fsm >> xyz->recent) & 1; + return r; +} + +static __always_inline meta_ptr_t meta_prefer_steady(const MDBX_env *env, + const meta_xyz_t *xyz) { + meta_ptr_t r; + r.txnid = xyz->txnid[xyz->prefer_steady]; + r.ptr_v = METAPAGE(env, xyz->prefer_steady); + r.is_steady = (xyz->fsm >> xyz->prefer_steady) & 1; + return r; +} + +static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, + const meta_xyz_t *xyz) 
{ + const uint8_t tail = xyz->tail_and_flags & 3; + meta_ptr_t r; + r.txnid = xyz->txnid[tail]; + r.ptr_v = METAPAGE(env, tail); + r.is_steady = (xyz->fsm >> tail) & 1; + return r; +} + +static const char *durable_caption(const volatile MDBX_meta *const meta) { if (META_IS_STEADY(meta)) - return (unaligned_peek_u64_volatile(4, meta->mm_datasync_sign) == + return (unaligned_peek_u64_volatile(4, meta->mm_sign) == meta_sign((const MDBX_meta *)meta)) ? "Steady" : "Tainted"; @@ -5151,9 +5177,8 @@ static const char *durable_caption(volatile const MDBX_meta *const meta) { /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. */ -static txnid_t find_oldest_reader(MDBX_env *env) { +static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); - const txnid_t steady = constmeta_txnid(env, constmeta_prefer_steady(env)); eASSERT(env, steady <= env->me_txn0->mt_txnid); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -5212,6 +5237,11 @@ static txnid_t find_oldest_reader(MDBX_env *env) { return new_oldest; } +static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { + return find_oldest_reader(txn->mt_env, + txn->tw.xyz.txnid[txn->tw.xyz.prefer_steady]); +} + /* Find largest mvcc-snapshot still referenced. 
*/ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, pgno_t last_used_page) { @@ -5583,7 +5613,6 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, } #endif /* MDBX_ENABLE_MADVISE */ - meta_cache_clear(env); rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); #if MDBX_ENABLE_MADVISE @@ -5675,22 +5704,20 @@ __cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, static int meta_unsteady(MDBX_env *env, const txnid_t last_steady, MDBX_meta *const meta, mdbx_filehandle_t fd) { const uint64_t wipe = MDBX_DATASIGN_NONE; - if (unlikely(META_IS_STEADY(meta)) && - constmeta_txnid(env, meta) <= last_steady) { + if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) { WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, data_page(meta)->mp_pgno); if (env->me_flags & MDBX_WRITEMAP) - unaligned_poke_u64(4, meta->mm_datasync_sign, wipe); + unaligned_poke_u64(4, meta->mm_sign, wipe); else - return osal_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign), - (uint8_t *)&meta->mm_datasync_sign - env->me_map); - if (constmeta_txnid(env, meta) == last_steady) - eASSERT(env, meta_checktxnid(env, meta, true)); + return osal_pwrite(fd, &wipe, sizeof(meta->mm_sign), + (uint8_t *)&meta->mm_sign - env->me_map); } return MDBX_SUCCESS; } -__cold static int wipe_steady(MDBX_env *env, const txnid_t last_steady) { +__cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) { + MDBX_env *const env = txn->mt_env; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -5736,7 +5763,11 @@ __cold static int wipe_steady(MDBX_env *env, const txnid_t last_steady) { /* force oldest refresh */ atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); - meta_cache_clear(env); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + txn->tw.xyz = meta_tap(env); + for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = 
scan->mt_child) + if (scan != txn) + scan->tw.xyz = txn->tw.xyz; return MDBX_SUCCESS; } @@ -6222,7 +6253,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (likely(timestamp == 0)) timestamp = osal_monotime(); #endif /* MDBX_ENABLE_PGOP_STAT */ - detent = find_oldest_reader(env) + 1; + detent = txn_oldest_reader(txn) + 1; ret.err = cursor_init(&recur.outer, txn, FREE_DBI); if (unlikely(ret.err != MDBX_SUCCESS)) @@ -6246,7 +6277,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (!(flags & MDBX_LIFORECLAIM)) { /* Do not try fetch more if the record will be too recent */ if (op != MDBX_FIRST && ++last >= detent) { - detent = find_oldest_reader(env) + 1; + detent = txn_oldest_reader(txn) + 1; if (detent <= last) break; } @@ -6256,7 +6287,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { if (op == MDBX_SET_RANGE) continue; - const txnid_t snap = find_oldest_reader(env); + const txnid_t snap = txn_oldest_reader(txn); if (unlikely(detent <= snap)) { detent = snap + 1; last = snap; @@ -6278,7 +6309,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { } last = unaligned_peek_u64(4, key.iov_base); if (detent <= last) { - detent = find_oldest_reader(env) + 1; + detent = txn_oldest_reader(txn) + 1; if (detent <= last) { if (flags & MDBX_LIFORECLAIM) continue; @@ -6423,15 +6454,15 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { const size_t next = (size_t)pgno + num; if (flags & MDBX_ALLOC_GC) { - const MDBX_meta *const head = constmeta_prefer_last(env); - const MDBX_meta *const steady = constmeta_prefer_steady(env); + const meta_ptr_t recent = meta_recent(env, &txn->tw.xyz); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.xyz); /* does reclaiming stopped at the last steady point? 
*/ - if (head != steady && META_IS_STEADY(steady) && - detent == constmeta_txnid(env, steady) + 1) { - DEBUG("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN + if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && + detent == prefer_steady.txnid + 1) { + DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN "-%s, detent %" PRIaTXN, - constmeta_txnid(env, head), durable_caption(head), - constmeta_txnid(env, steady), durable_caption(steady), detent); + recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, + durable_caption(prefer_steady.ptr_c), detent); ret.err = MDBX_RESULT_TRUE; const pgno_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); @@ -6447,12 +6478,13 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { * AND auto-sync threshold it NOT specified */ if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && ((autosync_threshold | autosync_period) == 0 || - next >= steady->mm_geo.now)) { + next >= prefer_steady.ptr_c->mm_geo.now)) { /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode * without any auto-sync threshold(s). */ - ret.err = wipe_steady(env, detent); + ret.err = wipe_steady(txn, detent); DEBUG("gc-wipe-steady, rc %d", ret.err); - eASSERT(env, steady != meta_prefer_steady(env)); + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.xyz).ptr_c); } else if ((flags & MDBX_ALLOC_NEW) == 0 || (autosync_threshold && atomic_load32(&env->me_lck->mti_unsynced_pages, @@ -6466,10 +6498,12 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { (next >= txn->mt_end_pgno && (autosync_threshold | autosync_period) == 0)) { /* make steady checkpoint. 
*/ - MDBX_meta meta = *head; - ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); + MDBX_meta meta = *recent.ptr_c; + ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, + &txn->tw.xyz); DEBUG("gc-make-steady, rc %d", ret.err); - eASSERT(env, steady != meta_prefer_steady(env)); + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.xyz).ptr_c); } if (likely(ret.err != MDBX_RESULT_TRUE)) { if (unlikely(ret.err != MDBX_SUCCESS)) @@ -6485,7 +6519,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { goto done; if (flags & MDBX_ALLOC_GC) { - const txnid_t laggard = find_oldest_reader(env); + const txnid_t laggard = txn_oldest_reader(txn); if (laggard >= detent || (laggard < txn->mt_txnid - xMDBX_TXNID_STEP && kick_longlived_readers(env, laggard) >= detent)) continue; @@ -6909,15 +6943,22 @@ retry:; goto bailout; } + const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); + meta_ptr_t head; + if (inside_txn | locked) + head = meta_recent(env, &env->me_txn0->tw.xyz); + else { + const meta_xyz_t xyz = meta_tap(env); + head = meta_recent(env, &xyz); + } const pgno_t unsynced_pages = atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); - volatile const MDBX_meta *head = meta_prefer_last(env); - const txnid_t head_txnid = meta_txnid(env, head); - const uint32_t synched_meta_txnid_u32 = - atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); - if (unsynced_pages == 0 && synched_meta_txnid_u32 == (uint32_t)head_txnid && - META_IS_STEADY(head)) - goto bailout; + if (unsynced_pages == 0) { + const uint32_t synched_meta_txnid_u32 = + atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); + if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady) + goto bailout; + } const pgno_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); @@ -6930,13 +6971,13 @@ retry:; autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for 
full steady sync */; - const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); if (!inside_txn) { if (!locked) { - int err; #if MDBX_ENABLE_PGOP_STAT unsigned wops = 0; #endif /* MDBX_ENABLE_PGOP_STAT */ + + int err; /* pre-sync to avoid latency for writer */ if (unsynced_pages > /* FIXME: define threshold */ 16 && (flags & MDBX_SAFE_NOSYNC) == 0) { @@ -6950,7 +6991,8 @@ retry:; if (unlikely(err != MDBX_SUCCESS)) return err; #endif - const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); + const size_t usedbytes = + pgno_align2os_bytes(env, head.ptr_c->mm_geo.next); err = osal_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); #if defined(_WIN32) || defined(_WIN64) osal_srwlock_ReleaseShared(&env->me_remap_guard); @@ -6980,26 +7022,25 @@ retry:; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += wops; #endif /* MDBX_ENABLE_PGOP_STAT */ - meta_cache_clear(env); + env->me_txn0->tw.xyz = meta_tap(env); + eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); goto retry; } - env->me_txn0->mt_txnid = head_txnid; - eASSERT(env, head_txnid == meta_txnid(env, head)); - eASSERT(env, head_txnid == recent_committed_txnid(env)); - find_oldest_reader(env); + eASSERT(env, head.txnid == recent_committed_txnid(env)); + env->me_txn0->mt_txnid = head.txnid; + txn_oldest_reader(env->me_txn0); flags |= MDBX_SHRINK_ALLOWED; } eASSERT(env, inside_txn || locked); eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); - if (!META_IS_STEADY(head) || - ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { + if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, - data_page((const void *)head)->mp_pgno, durable_caption(head), + data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), unsynced_pages); - MDBX_meta meta = *head; - rc = sync_locked(env, flags, &meta); + MDBX_meta meta = *head.ptr_c; + rc = sync_locked(env, flags, &meta, 
&env->me_txn0->tw.xyz); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -7007,7 +7048,7 @@ retry:; /* LY: sync meta-pages if MDBX_NOMETASYNC enabled * and someone was not synced above. */ if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)head_txnid) { + (uint32_t)head.txnid) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -7017,7 +7058,7 @@ retry:; MDBX_SYNC_DATA | MDBX_SYNC_IODQ) : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (likely(rc == MDBX_SUCCESS)) - atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head_txnid, + atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid, mo_Relaxed); } @@ -7216,8 +7257,7 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { pgno_t last = MAX_PAGENO + 1; if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { /* inside write-txn */ - const MDBX_meta *head = constmeta_prefer_last(env); - last = head->mm_geo.next; + last = meta_recent(env, &env->me_txn0->xyz).ptr_v->mm_geo.next; } else if (env->me_flags & MDBX_RDONLY) { /* read-only mode, no write-txn, no wlock mutex */ last = NUM_METAS; @@ -7225,7 +7265,6 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { /* no write-txn */ last = NUM_METAS; should_unlock = true; - meta_cache_clear(env); } else { /* write txn is running, therefore shouldn't poison any memory range */ return; @@ -7389,18 +7428,18 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { } /* check against todo4recovery://erased_by_github/libmdbx/issues/269 */ -static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, - bool report) { - const txnid_t head_txnid = meta_txnid(env, meta); - const txnid_t freedb_mod_txnid = meta->mm_dbs[FREE_DBI].md_mod_txnid; - const txnid_t maindb_mod_txnid = meta->mm_dbs[MAIN_DBI].md_mod_txnid; +static bool coherency_check(const MDBX_env *env, const txnid_t txnid, + const volatile MDBX_db 
*dbs, + const volatile MDBX_meta *meta, bool report) { + const txnid_t freedb_mod_txnid = dbs[FREE_DBI].md_mod_txnid; + const txnid_t maindb_mod_txnid = dbs[MAIN_DBI].md_mod_txnid; - const pgno_t freedb_root_pgno = meta->mm_dbs[FREE_DBI].md_root; + const pgno_t freedb_root_pgno = dbs[FREE_DBI].md_root; const MDBX_page *freedb_root = (env->me_map && freedb_root_pgno != P_INVALID) ? pgno2page(env, freedb_root_pgno) : nullptr; - const pgno_t maindb_root_pgno = meta->mm_dbs[MAIN_DBI].md_root; + const pgno_t maindb_root_pgno = dbs[MAIN_DBI].md_root; const MDBX_page *maindb_root = (env->me_map && maindb_root_pgno != P_INVALID) ? pgno2page(env, maindb_root_pgno) : nullptr; @@ -7408,23 +7447,23 @@ static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, unaligned_peek_u64_volatile(4, &meta->mm_magic_and_version); bool ok = true; - if (unlikely(!head_txnid || head_txnid < freedb_mod_txnid || + if (unlikely(txnid < freedb_mod_txnid || (!freedb_mod_txnid && freedb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", - "free", freedb_mod_txnid, head_txnid, + "free", freedb_mod_txnid, txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } - if (unlikely(head_txnid < maindb_mod_txnid || + if (unlikely(txnid < maindb_mod_txnid || (!maindb_mod_txnid && maindb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN " %s", - "main", maindb_mod_txnid, head_txnid, + "main", maindb_mod_txnid, txnid, "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } @@ -7461,38 +7500,62 @@ static bool meta_checktxnid(const MDBX_env *env, const volatile MDBX_meta *meta, return ok; } -__cold static bool is_timeout(uint64_t *timestamp) { - if (likely(!*timestamp)) { +__cold static int coherency_timeout(uint64_t *timestamp) { + if 
(likely(timestamp && *timestamp == 0)) *timestamp = osal_monotime(); - return false; + else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) { + ERROR("bailout waiting for valid snapshot (%s)", + "workaround for incoherent flaw of unified page/buffer cache"); + return MDBX_CORRUPTED; } - return osal_monotime() - *timestamp > 65536 / 10; + + osal_memory_fence(mo_AcquireRelease, true); +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); +#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) + sched_yield(); +#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) + pthread_yield(); +#else + usleep(42); +#endif + return MDBX_RESULT_TRUE; } /* check with timeout as the workaround * for todo4recovery://erased_by_github/libmdbx/issues/269 */ -static int meta_waittxnid(const MDBX_env *env, const volatile MDBX_meta *meta, - uint64_t *timestamp) { - if (likely(meta_checktxnid(env, meta, !*timestamp))) - return MDBX_SUCCESS; +__hot static int coherency_check_readed(const MDBX_env *env, + const txnid_t txnid, + const volatile MDBX_db *dbs, + const volatile MDBX_meta *meta, + uint64_t *timestamp) { + const bool report = !(timestamp && *timestamp); + if (unlikely(!coherency_check(env, txnid, dbs, meta, report))) + return coherency_timeout(timestamp); + return MDBX_SUCCESS; +} - if (likely(!is_timeout(timestamp))) { - osal_memory_fence(mo_AcquireRelease, true); -#if defined(_WIN32) || defined(_WIN64) - SwitchToThread(); -#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) - sched_yield(); -#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) - pthread_yield(); -#else - usleep(42); -#endif - return MDBX_RESULT_TRUE; +static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, + const volatile MDBX_meta *meta, + uint64_t *timestamp) { + const bool report = !(timestamp && *timestamp); + const txnid_t head_txnid = meta_txnid(meta); + if 
(unlikely(head_txnid < MIN_TXNID || (head_txnid < txnid))) { + if (report) + WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", + (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid, + bytes2pgno(env, (const uint8_t *)meta - env->me_dxb_mmap.dxb), + "(workaround for incoherent flaw of unified page/buffer cache)"); + return coherency_timeout(timestamp); } + return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp); +} - ERROR("bailout waiting for valid snapshot (%s)", - "workaround for incoherent flaw of unified page/buffer cache"); - return MDBX_CORRUPTED; +static bool coherency_check_meta(const MDBX_env *env, + const volatile MDBX_meta *meta, bool report) { + uint64_t timestamp = 0; + return coherency_check_written(env, 0, meta, report ? &timestamp : nullptr) == + MDBX_SUCCESS; } /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ @@ -7571,31 +7634,28 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Seek & fetch the last meta */ uint64_t timestamp = 0; unsigned loop = 0; + meta_xyz_t xyz = meta_tap(env); while (1) { - meta_cache_clear(env); - volatile const MDBX_meta *const meta = + const meta_ptr_t head = likely(env->me_stuck_meta < 0) - ? /* regular */ meta_prefer_last(env) - : /* recovery mode */ METAPAGE(env, env->me_stuck_meta); - jitter4testing(false); - const txnid_t target_txnid = meta_txnid(env, meta); - jitter4testing(false); + ?
/* regular */ meta_recent(env, &xyz) + : /* recovery mode */ meta_ptr(env, env->me_stuck_meta); if (likely(r)) { safe64_reset(&r->mr_txnid, false); - atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, + atomic_store32(&r->mr_snapshot_pages_used, head.ptr_v->mm_geo.next, mo_Relaxed); - atomic_store64(&r->mr_snapshot_pages_retired, - unaligned_peek_u64_volatile(4, meta->mm_pages_retired), - mo_Relaxed); - safe64_write(&r->mr_txnid, target_txnid); - jitter4testing(false); + atomic_store64( + &r->mr_snapshot_pages_retired, + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired), + mo_Relaxed); + safe64_write(&r->mr_txnid, head.txnid); eASSERT(env, r->mr_pid.weak == osal_getpid()); eASSERT(env, r->mr_tid.weak == ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self())); - eASSERT(env, r->mr_txnid.weak == target_txnid || + eASSERT(env, r->mr_txnid.weak == head.txnid || (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && - target_txnid < env->me_lck->mti_oldest_reader.weak)); + head.txnid < env->me_lck->mti_oldest_reader.weak)); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_AcquireRelease); } else { @@ -7606,39 +7666,48 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { jitter4testing(true); /* Snap the state from current meta-head */ - txn->mt_txnid = target_txnid; - txn->mt_geo = meta->mm_geo; - STATIC_ASSERT(CORE_DBS == 2); - txn->mt_dbs[0] = meta->mm_dbs[0]; - txn->mt_dbs[1] = meta->mm_dbs[1]; - txn->mt_canary = meta->mm_canary; + txn->mt_txnid = head.txnid; + txn->mt_geo = head.ptr_v->mm_geo; + memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + txn->mt_canary = head.ptr_v->mm_canary; - /* LY: Retry on a race, ITS#7970. 
*/ - const txnid_t oldest = - atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease); - if (unlikely(target_txnid < oldest || - (meta != meta_prefer_last(env) && env->me_stuck_meta < 0) || - target_txnid != meta_txnid(env, meta))) { + if (unlikely(env->me_stuck_meta >= 0)) + break; + if (unlikely(meta_should_retry(env, &xyz) || + head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader, + mo_AcquireRelease))) { if (unlikely(++loop > 42)) { ERROR("bailout waiting for valid snapshot (%s)", "metapages are too volatile"); rc = MDBX_PROBLEM; + txn->mt_txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->mr_txnid, false); goto bailout; } timestamp = 0; continue; } - rc = meta_waittxnid(env, meta, &timestamp); + rc = coherency_check_readed(env, head.txnid, txn->mt_dbs, head.ptr_v, + &timestamp); jitter4testing(false); if (likely(rc == MDBX_SUCCESS)) break; - if (unlikely(rc != MDBX_RESULT_TRUE)) + + if (unlikely(rc != MDBX_RESULT_TRUE)) { + txn->mt_txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->mr_txnid, false); goto bailout; + } } if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { ERROR("%s", "environment corrupted by died writer, must shutdown!"); + if (likely(r)) + safe64_reset(&r->mr_txnid, false); + txn->mt_txnid = INVALID_TXNID; rc = MDBX_CORRUPTED; goto bailout; } @@ -7687,22 +7756,21 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { } #endif /* Windows */ - meta_cache_clear(env); - jitter4testing(false); - const MDBX_meta *meta = constmeta_prefer_last(env); + txn->tw.xyz = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &txn->tw.xyz); uint64_t timestamp = 0; while ( "workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { - rc = meta_waittxnid(env, meta, &timestamp); + rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs, + head.ptr_v, &timestamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - jitter4testing(false); - txn->mt_canary =
meta->mm_canary; - const txnid_t snap = constmeta_txnid(env, meta); - txn->mt_txnid = safe64_txnid_next(snap); + txn->mt_canary = head.ptr_c->mm_canary; + eASSERT(env, meta_txnid(head.ptr_v) == head.txnid); + txn->mt_txnid = safe64_txnid_next(head.txnid); if (unlikely(txn->mt_txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; ERROR("txnid overflow, raise %d", rc); @@ -7726,9 +7794,9 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->mt_numdbs = env->me_numdbs; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ - txn->mt_geo = meta->mm_geo; + txn->mt_geo = head.ptr_c->mm_geo; rc = dpl_alloc(txn); if (unlikely(rc != MDBX_SUCCESS)) @@ -8040,6 +8108,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_numdbs = parent->mt_numdbs; txn->mt_owner = parent->mt_owner; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); + txn->tw.xyz = parent->tw.xyz; /* Copy parent's mt_dbistate, but clear DB_NEW */ for (unsigned i = 0; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] = @@ -8114,23 +8183,21 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_used = pgno2bytes(env, txn->mt_geo.next); if (txn->mt_flags & MDBX_TXN_RDONLY) { - volatile const MDBX_meta *head_meta; - txnid_t head_txnid; + meta_ptr_t head; uint64_t head_retired; + meta_xyz_t xyz = meta_tap(env); do { /* fetch info from volatile head */ - head_meta = meta_prefer_last(env); - head_txnid = meta_txnid(env, head_meta); + head = meta_recent(env, &xyz); head_retired = - unaligned_peek_u64_volatile(4, head_meta->mm_pages_retired); - info->txn_space_limit_soft = pgno2bytes(env, head_meta->mm_geo.now); - info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); + 
unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); + info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->mm_geo.now); + info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->mm_geo.upper); info->txn_space_leftover = - pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); - } while (unlikely(head_meta != meta_prefer_last(env) || - head_txnid != meta_txnid(env, head_meta))); + pgno2bytes(env, head.ptr_v->mm_geo.now - head.ptr_v->mm_geo.next); + } while (unlikely(meta_should_retry(env, &xyz))); - info->txn_reader_lag = head_txnid - info->txn_id; + info->txn_reader_lag = head.txnid - info->txn_id; info->txn_space_dirty = info->txn_space_retired = 0; uint64_t reader_snapshot_pages_retired; if (txn->to.reader && @@ -8144,7 +8211,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && info->txn_reader_lag > 1 && lck) { /* find next more recent reader */ - txnid_t next_reader = head_txnid; + txnid_t next_reader = head.txnid; const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { @@ -8195,7 +8262,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); if (snap_nreaders) { - oldest_snapshot = find_oldest_reader(env); + oldest_snapshot = txn_oldest_reader(txn); if (oldest_snapshot == txn->mt_txnid - 1) { /* check if there is at least one reader */ bool exists = false; @@ -8516,6 +8583,8 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, + memcmp(&txn->tw.xyz, &parent->tw.xyz, sizeof(meta_xyz_t)) == 0); if (txn->tw.lifo_reclaimed) { eASSERT(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= @@ -9268,7 
+9337,7 @@ retry: retry_rid: ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; do { - snap_oldest = find_oldest_reader(env); + snap_oldest = txn_oldest_reader(txn); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) @@ -9301,7 +9370,7 @@ retry: ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { tASSERT(txn, txn->tw.last_reclaimed == 0); - if (unlikely(find_oldest_reader(env) != snap_oldest)) + if (unlikely(txn_oldest_reader(txn) != snap_oldest)) /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) * if the oldest reader changes since the last attempt */ goto retry_rid; @@ -9396,7 +9465,7 @@ retry: } else { tASSERT(txn, txn->tw.lifo_reclaimed == NULL); if (unlikely(ctx->rid == 0)) { - ctx->rid = find_oldest_reader(env); + ctx->rid = txn_oldest_reader(txn); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); if (rc == MDBX_SUCCESS) { if (unlikely(key.iov_len != sizeof(txnid_t))) { @@ -10312,6 +10381,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN " -> %" PRIaTXN, i, db->md_mod_txnid, txn->mt_txnid); + /* Может быть mod_txnid > front после коммита вложенных тразакций */ db->md_mod_txnid = txn->mt_txnid; data.iov_base = db; WITH_CURSOR_TRACKING(couple.outer, @@ -10360,14 +10430,14 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ts_3 = latency ? 
osal_monotime() : 0; if (likely(rc == MDBX_SUCCESS)) { - const MDBX_meta *head = constmeta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, &txn->tw.xyz); MDBX_meta meta; - memcpy(meta.mm_magic_and_version, head->mm_magic_and_version, 8); - meta.mm_extra_flags = head->mm_extra_flags; - meta.mm_validator_id = head->mm_validator_id; - meta.mm_extra_pagehdr = head->mm_extra_pagehdr; + memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); + meta.mm_extra_flags = head.ptr_c->mm_extra_flags; + meta.mm_validator_id = head.ptr_c->mm_validator_id; + meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; unaligned_poke_u64(4, meta.mm_pages_retired, - unaligned_peek_u64(4, head->mm_pages_retired) + + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + MDBX_PNL_SIZE(txn->tw.retired_pages)); meta.mm_geo = txn->mt_geo; meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; @@ -10385,7 +10455,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { meta_set_txnid(env, &meta, commit_txnid); rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, - &meta); + &meta, &txn->tw.xyz); } ts_4 = latency ? 
osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { @@ -10461,11 +10531,10 @@ static int validate_meta(MDBX_env *env, MDBX_meta *const meta, /* LY: check signature as a checksum */ if (META_IS_STEADY(meta) && - unlikely(unaligned_peek_u64(4, &meta->mm_datasync_sign) != - meta_sign(meta))) { + unlikely(unaligned_peek_u64(4, &meta->mm_sign) != meta_sign(meta))) { WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 "), skip it", - meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign), + meta_number, unaligned_peek_u64(4, &meta->mm_sign), meta_sign(meta)); return MDBX_RESULT_TRUE; } @@ -10682,7 +10751,7 @@ __cold static int read_header(MDBX_env *env, MDBX_meta *dest, return rc; memset(dest, 0, sizeof(MDBX_meta)); - unaligned_poke_u64(4, dest->mm_datasync_sign, MDBX_DATASIGN_WEAK); + unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK); rc = MDBX_CORRUPTED; /* Read twice all meta pages so we can find the latest one. */ @@ -10738,10 +10807,18 @@ __cold static int read_header(MDBX_env *env, MDBX_meta *dest, if (rc != MDBX_SUCCESS) continue; - if ((env->me_stuck_meta < 0) - ? meta_ot(meta_bootid_match(meta) ? 
prefer_last : prefer_steady, - env, dest, meta) - : (meta_number == (unsigned)env->me_stuck_meta)) { + bool latch; + if (env->me_stuck_meta >= 0) + latch = (meta_number == (unsigned)env->me_stuck_meta); + else if (meta_bootid_match(meta)) + latch = meta_choice_recent( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + else + latch = meta_choice_steady( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + if (latch) { *dest = *meta; if (!lck_exclusive && !META_IS_STEADY(dest)) loop_limit += 1; /* LY: should re-read to hush race with update */ @@ -10805,8 +10882,8 @@ __cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; meta_set_txnid(env, model_meta, MIN_TXNID + num); - unaligned_poke_u64(4, model_meta->mm_datasync_sign, meta_sign(model_meta)); - eASSERT(env, meta_checktxnid(env, model_meta, true)); + unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta)); + eASSERT(env, coherency_check_meta(env, model_meta, true)); return (MDBX_page *)((uint8_t *)model + env->me_psize); } @@ -10817,9 +10894,6 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { MDBX_page *page1 = meta_model(env, page0, 0); MDBX_page *page2 = meta_model(env, page1, 1); meta_model(env, page2, 2); - eASSERT(env, !meta_eq(env, page_meta(page0), page_meta(page1))); - eASSERT(env, !meta_eq(env, page_meta(page1), page_meta(page2))); - eASSERT(env, !meta_eq(env, page_meta(page2), page_meta(page0))); return page_meta(page2); } @@ -10837,16 +10911,15 @@ static size_t madvise_threshold(const MDBX_env *env, } #endif /* MDBX_ENABLE_MADVISE */ -static int sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending) { +static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, + meta_xyz_t *const xyz) { 
eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - const MDBX_meta *const head = constmeta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, xyz); int rc; - eASSERT(env, meta_eq_mask(env) == 0); eASSERT(env, pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); eASSERT(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); @@ -10872,8 +10945,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, if (flags & MDBX_SHRINK_ALLOWED) { /* LY: check conditions to discard unused pages */ const pgno_t largest_pgno = find_largest_snapshot( - env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next - : pending->mm_geo.next); + env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) + ? head.ptr_c->mm_geo.next + : pending->mm_geo.next); eASSERT(env, largest_pgno >= NUM_METAS); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) const pgno_t edge = env->me_poison_edge; @@ -10948,26 +11022,25 @@ static int sync_locked(MDBX_env *env, unsigned flags, const pgno_t bottom = (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; if (pending->mm_geo.now > bottom) { - if (META_IS_STEADY(meta_prefer_steady(env))) + if (XYZ_HAVE_STEADY(xyz)) /* force steady, but only if steady-checkpoint is present */ flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; shrink = pending->mm_geo.now - bottom; pending->mm_geo.now = bottom; - if (unlikely(constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a))) { - const txnid_t txnid = - safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a)); + if (unlikely(head.txnid == pending->unsafe_txnid)) { + const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, - unaligned_peek_u64(4, pending->mm_txnid_a), txnid); - ENSURE(env, env->me_txn0->mt_owner != osal_thread_self() && - !env->me_txn); + pending->unsafe_txnid, txnid); + ENSURE(env, !env->me_txn0 || + (env->me_txn0->mt_owner != osal_thread_self() && + !env->me_txn)); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; ERROR("txnid overflow, raise %d", rc); goto fail; } meta_set_txnid(env, pending, txnid); - eASSERT(env, meta_checktxnid(env, pending, true)); + eASSERT(env, coherency_check_meta(env, pending, true)); } } } @@ -10981,7 +11054,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; if ((flags & MDBX_SAFE_NOSYNC) == 0) { mode_bits = MDBX_SYNC_DATA; - if (pending->mm_geo.next > meta_prefer_steady(env)->mm_geo.now) + if (pending->mm_geo.next > meta_prefer_steady(env, xyz).ptr_c->mm_geo.now) mode_bits |= MDBX_SYNC_SIZE; if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; @@ -11000,42 +11073,41 @@ static int sync_locked(MDBX_env *env, unsigned flags, rc = (flags & MDBX_SAFE_NOSYNC) ? 
MDBX_RESULT_TRUE /* carry non-steady */ : MDBX_RESULT_FALSE /* carry steady */; } - eASSERT(env, meta_checktxnid(env, pending, true)); + eASSERT(env, coherency_check_meta(env, pending, true)); /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(), mo_Relaxed); - unaligned_poke_u64(4, pending->mm_datasync_sign, meta_sign(pending)); + unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); - unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK); + unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); } + const bool legal4overwrite = + head.txnid == pending->unsafe_txnid && + memcmp(&head.ptr_c->mm_dbs, &pending->mm_dbs, sizeof(pending->mm_dbs)) == + 0 && + memcmp(&head.ptr_c->mm_canary, &pending->mm_canary, + sizeof(pending->mm_canary)) == 0 && + memcmp(&head.ptr_c->mm_geo, &pending->mm_geo, sizeof(pending->mm_geo)) == + 0; MDBX_meta *target = nullptr; - if (constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a)) { - eASSERT(env, - memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); - eASSERT(env, memcmp(&head->mm_canary, &pending->mm_canary, - sizeof(head->mm_canary)) == 0); - eASSERT(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(pending->mm_geo)) == 0); - if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) - target = (MDBX_meta *)head; + if (head.txnid == pending->unsafe_txnid) { + ENSURE(env, legal4overwrite); + if (!head.is_steady && META_IS_STEADY(pending)) + target = (MDBX_meta *)head.ptr_c; else { - ENSURE(env, meta_eq(env, head, pending)); - DEBUG("%s", "skip update meta"); + WARNING("%s", "skip update meta"); return MDBX_SUCCESS; } - } else if (head == meta0) - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta1, meta2); - else if (head == meta1) - target = (MDBX_meta 
*)meta_ancient_prefer_weak(env, meta0, meta2); - else { - eASSERT(env, head == meta2); - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta0, meta1); + } else { + const unsigned xyz_tail = xyz->tail_and_flags & 3; + ENSURE(env, xyz_tail < NUM_METAS && xyz_tail != xyz->recent && + xyz_tail != xyz->prefer_steady); + target = (MDBX_meta *)meta_tail(env, xyz).ptr_c; } /* LY: step#2 - update meta-page. */ @@ -11046,49 +11118,51 @@ static int sync_locked(MDBX_env *env, unsigned flags, pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), - unaligned_peek_u64(4, pending->mm_txnid_a), durable_caption(pending)); + pending->unsafe_txnid, durable_caption(pending)); DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, - (meta0 == head) ? "head" - : (meta0 == target) ? "tail" - : "stay", - durable_caption(meta0), constmeta_txnid(env, meta0), + (meta0 == head.ptr_c) ? "head" + : (meta0 == target) ? "tail" + : "stay", + durable_caption(meta0), constmeta_txnid(meta0), meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, - (meta1 == head) ? "head" - : (meta1 == target) ? "tail" - : "stay", - durable_caption(meta1), constmeta_txnid(env, meta1), + (meta1 == head.ptr_c) ? "head" + : (meta1 == target) ? "tail" + : "stay", + durable_caption(meta1), constmeta_txnid(meta1), meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, - (meta2 == head) ? "head" - : (meta2 == target) ? "tail" - : "stay", - durable_caption(meta2), constmeta_txnid(env, meta2), + (meta2 == head.ptr_c) ? "head" + : (meta2 == target) ? 
"tail" + : "stay", + durable_caption(meta2), constmeta_txnid(meta2), meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); - eASSERT(env, !meta_eq(env, pending, meta0)); - eASSERT(env, !meta_eq(env, pending, meta1)); - eASSERT(env, !meta_eq(env, pending, meta2)); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta0) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta0))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta1) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta1))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta2) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta2))); eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - ENSURE(env, target == head || constmeta_txnid(env, target) < - unaligned_peek_u64(4, pending->mm_txnid_a)); + ENSURE(env, target == head.ptr_c || + constmeta_txnid(target) < pending->unsafe_txnid); #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) { jitter4testing(true); - if (likely(target != head)) { + if (likely(target != head.ptr_c)) { /* LY: 'invalidate' the meta. */ - meta_update_begin(env, target, - unaligned_peek_u64(4, pending->mm_txnid_a)); - unaligned_poke_u64(4, target->mm_datasync_sign, MDBX_DATASIGN_WEAK); + meta_update_begin(env, target, pending->unsafe_txnid); + unaligned_poke_u64(4, target->mm_sign, MDBX_DATASIGN_WEAK); #ifndef NDEBUG /* debug: provoke failure to catch a violators, but don't touch mm_psize * to allow readers catch actual pagesize. 
*/ uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; - uint8_t *provoke_end = (uint8_t *)&target->mm_datasync_sign; + uint8_t *provoke_end = (uint8_t *)&target->mm_sign; memset(provoke_begin, 0xCC, provoke_end - provoke_begin); jitter4testing(false); #endif @@ -11104,21 +11178,14 @@ static int sync_locked(MDBX_env *env, unsigned flags, /* LY: 'commit' the meta */ meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b)); jitter4testing(true); - eASSERT(env, meta_checktxnid(env, target, true)); + eASSERT(env, coherency_check_meta(env, target, true)); } else { - /* dangerous case (target == head), only mm_datasync_sign could + /* dangerous case (target == head), only mm_sign could * me updated, check assertions once again */ - ENSURE(env, constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a) && - !META_IS_STEADY(head) && META_IS_STEADY(pending)); - ENSURE(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(head->mm_geo)) == 0); - ENSURE(env, memcmp(&head->mm_dbs, &pending->mm_dbs, - sizeof(head->mm_dbs)) == 0); - ENSURE(env, memcmp(&head->mm_canary, &pending->mm_canary, - sizeof(head->mm_canary)) == 0); + eASSERT(env, + legal4overwrite && !head.is_steady && META_IS_STEADY(pending)); } - memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8); + memcpy(target->mm_sign, pending->mm_sign, 8); osal_flush_incoherent_cpu_writeback(); jitter4testing(true); /* sync meta-pages */ @@ -11156,19 +11223,24 @@ static int sync_locked(MDBX_env *env, unsigned flags, } } - meta_cache_clear(env); uint64_t timestamp = 0; while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { - rc = meta_waittxnid(env, target, &timestamp); + rc = + coherency_check_written(env, pending->unsafe_txnid, target, &timestamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto fail; } env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a) - + 
(uint32_t)pending->unsafe_txnid - ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0); + *xyz = meta_tap(env); + for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) + if (xyz != &txn->tw.xyz) + txn->tw.xyz = *xyz; + /* LY: shrink datafile if needed */ if (unlikely(shrink)) { VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", @@ -11177,7 +11249,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, pending->mm_geo.upper); if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) goto fail; - eASSERT(env, meta_checktxnid(env, target, true)); + eASSERT(env, coherency_check_meta(env, target, true)); } MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -11415,34 +11487,37 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (unlikely(err != MDBX_SUCCESS)) return err; need_unlock = true; - meta_cache_clear(env); + env->me_txn0->tw.xyz = meta_tap(env); + eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); } - const MDBX_meta *head = constmeta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.xyz); if (!inside_txn) { - env->me_txn0->mt_txnid = constmeta_txnid(env, head); - find_oldest_reader(env); + env->me_txn0->mt_txnid = head.txnid; + txn_oldest_reader(env->me_txn0); } - /* get untouched params from DB */ + /* get untouched params from current TXN or DB */ if (pagesize <= 0 || pagesize >= INT_MAX) pagesize = env->me_psize; + const MDBX_geo *const geo = + inside_txn ? 
&env->me_txn->mt_geo : &head.ptr_c->mm_geo; if (size_lower < 0) - size_lower = pgno2bytes(env, head->mm_geo.lower); + size_lower = pgno2bytes(env, geo->lower); if (size_now < 0) - size_now = pgno2bytes(env, head->mm_geo.now); + size_now = pgno2bytes(env, geo->now); if (size_upper < 0) - size_upper = pgno2bytes(env, head->mm_geo.upper); + size_upper = pgno2bytes(env, geo->upper); if (growth_step < 0) - growth_step = pgno2bytes(env, pv2pages(head->mm_geo.grow_pv)); + growth_step = pgno2bytes(env, pv2pages(geo->grow_pv)); if (shrink_threshold < 0) - shrink_threshold = pgno2bytes(env, pv2pages(head->mm_geo.shrink_pv)); + shrink_threshold = pgno2bytes(env, pv2pages(geo->shrink_pv)); if (pagesize != (intptr_t)env->me_psize) { rc = MDBX_EINVAL; goto bailout; } const size_t usedbytes = - pgno2bytes(env, find_largest_snapshot(env, head->mm_geo.next)); + pgno2bytes(env, find_largest_snapshot(env, geo->next)); if ((size_t)size_upper < usedbytes) { rc = MDBX_MAP_FULL; goto bailout; @@ -11639,19 +11714,20 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, const MDBX_geo *current_geo; if (!inside_txn) { eASSERT(env, need_unlock); - const MDBX_meta *head = constmeta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.xyz); uint64_t timestamp = 0; while ("workaround for " "todo4recovery://erased_by_github/libmdbx/issues/269") { - meta = *head; - rc = meta_waittxnid(env, &meta, &timestamp); + meta = *head.ptr_c; + rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta, + &timestamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - const txnid_t txnid = safe64_txnid_next(constmeta_txnid(env, &meta)); + const txnid_t txnid = safe64_txnid_next(head.txnid); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; ERROR("txnid overflow, raise %d", rc); @@ -11734,7 +11810,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_txn->mt_flags |= 
MDBX_TXN_DIRTY; } else { meta.mm_geo = new_geo; - rc = sync_locked(env, env->me_flags, &meta); + rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.xyz); } if (likely(rc == MDBX_SUCCESS)) { @@ -11786,9 +11862,9 @@ __cold static int alloc_page_buf(MDBX_env *env) { /* Further setup required for opening an MDBX environment */ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bits) { - MDBX_meta meta; + MDBX_meta header; int rc = MDBX_RESULT_FALSE; - int err = read_header(env, &meta, lck_rc, mode_bits); + int err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0 || @@ -11809,7 +11885,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, if (unlikely(err != MDBX_SUCCESS)) return err; - meta = *init_metas(env, env->me_pbuf); + header = *init_metas(env, env->me_pbuf); err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, 0); if (unlikely(err != MDBX_SUCCESS)) @@ -11822,34 +11898,36 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return err; #ifndef NDEBUG /* just for checking */ - err = read_header(env, &meta, lck_rc, mode_bits); + err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) return err; #endif } - VERBOSE( - "header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO - "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.grow_pv), pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), durable_caption(&meta)); + VERBOSE("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN + ", %s", + 
header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, + header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, + header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), + pv2pages(header.mm_geo.shrink_pv), + unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header)); - if (env->me_psize != meta.mm_psize) - setup_pagesize(env, meta.mm_psize); - const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); + if (env->me_psize != header.mm_psize) + setup_pagesize(env, header.mm_psize); + const size_t used_bytes = pgno2bytes(env, header.mm_geo.next); const size_t used_aligned2os_bytes = ceil_powerof2(used_bytes, env->me_os_psize); if ((env->me_flags & MDBX_RDONLY) /* readonly */ || lck_rc != MDBX_RESULT_TRUE /* not exclusive */ || /* recovery mode */ env->me_stuck_meta >= 0) { /* use present params from db */ - const size_t pagesize = meta.mm_psize; + const size_t pagesize = header.mm_psize; err = mdbx_env_set_geometry( - env, meta.mm_geo.lower * pagesize, meta.mm_geo.now * pagesize, - meta.mm_geo.upper * pagesize, pv2pages(meta.mm_geo.grow_pv) * pagesize, - pv2pages(meta.mm_geo.shrink_pv) * pagesize, meta.mm_psize); + env, header.mm_geo.lower * pagesize, header.mm_geo.now * pagesize, + header.mm_geo.upper * pagesize, + pv2pages(header.mm_geo.grow_pv) * pagesize, + pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { ERROR("%s: err %d", "could not apply preconfigured geometry from db", err); @@ -11867,13 +11945,13 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, * - shrink threshold or growth step * But ignore change just a 'now/current' size. 
*/ if (bytes_align2os_bytes(env, env->me_dbgeo.upper) != - pgno2bytes(env, meta.mm_geo.upper) || + pgno2bytes(env, header.mm_geo.upper) || bytes_align2os_bytes(env, env->me_dbgeo.lower) != - pgno2bytes(env, meta.mm_geo.lower) || + pgno2bytes(env, header.mm_geo.lower) || bytes_align2os_bytes(env, env->me_dbgeo.shrink) != - pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)) || + pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)) || bytes_align2os_bytes(env, env->me_dbgeo.grow) != - pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv))) { + pgno2bytes(env, pv2pages(header.mm_geo.grow_pv))) { if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes) /* pre-shrink if enabled */ @@ -11882,44 +11960,45 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now, env->me_dbgeo.upper, env->me_dbgeo.grow, - env->me_dbgeo.shrink, meta.mm_psize); + env->me_dbgeo.shrink, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { ERROR("%s: err %d", "could not apply preconfigured db-geometry", err); return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; } /* update meta fields */ - meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); - meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); - meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); - meta.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); - meta.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); + header.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + header.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + header.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + header.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); + header.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, - meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv), - pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), durable_caption(&meta)); + header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, + header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, + header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), + pv2pages(header.mm_geo.shrink_pv), + unaligned_peek_u64(4, header.mm_txnid_a), + durable_caption(&header)); } else { /* fetch back 'now/current' size, since it was ignored during comparison * and may differ. */ - env->me_dbgeo.now = pgno_align2os_bytes(env, meta.mm_geo.now); + env->me_dbgeo.now = pgno_align2os_bytes(env, header.mm_geo.now); } - ENSURE(env, meta.mm_geo.now >= meta.mm_geo.next); + ENSURE(env, header.mm_geo.now >= header.mm_geo.next); } else { /* geo-params are not pre-configured by user, * get current values from the meta. 
*/ - env->me_dbgeo.now = pgno2bytes(env, meta.mm_geo.now); - env->me_dbgeo.lower = pgno2bytes(env, meta.mm_geo.lower); - env->me_dbgeo.upper = pgno2bytes(env, meta.mm_geo.upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)); + env->me_dbgeo.now = pgno2bytes(env, header.mm_geo.now); + env->me_dbgeo.lower = pgno2bytes(env, header.mm_geo.lower); + env->me_dbgeo.upper = pgno2bytes(env, header.mm_geo.upper); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(header.mm_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)); } - ENSURE(env, pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now); + ENSURE(env, pgno_align2os_bytes(env, header.mm_geo.now) == env->me_dbgeo.now); ENSURE(env, env->me_dbgeo.now >= used_bytes); const uint64_t filesize_before = env->me_dxb_mmap.filesize; if (unlikely(filesize_before != env->me_dbgeo.now)) { @@ -11937,7 +12016,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, if (filesize_before < used_bytes) { ERROR("last-page beyond end-of-file (last %" PRIaPGNO ", have %" PRIaPGNO ")", - meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); + header.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); return MDBX_CORRUPTED; } @@ -12011,6 +12090,8 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, : env->me_dxb_mmap.limit); #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + meta_xyz_t xyz = meta_tap(env); + eASSERT(env, !env->me_txn && !env->me_txn0); //-------------------------------- validate/rollback head & steady meta-pages if (unlikely(env->me_stuck_meta >= 0)) { /* recovery mode */ @@ -12024,7 +12105,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, } } else /* not recovery mode */ while (1) { - const unsigned meta_clash_mask = meta_eq_mask(env); + const unsigned meta_clash_mask = meta_eq_mask(&xyz); if (unlikely(meta_clash_mask)) { 
ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); return MDBX_CORRUPTED; @@ -12033,11 +12114,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { /* non-exclusive mode, * meta-pages should be validated by a first process opened the DB */ - volatile const MDBX_meta *const head = meta_prefer_last(env); - volatile const MDBX_meta *const steady = meta_prefer_steady(env); - const txnid_t head_txnid = meta_txnid(env, head); - const txnid_t steady_txnid = meta_txnid(env, steady); - if (head_txnid == steady_txnid) + if (xyz.recent == xyz.prefer_steady) break; if (!env->me_lck_mmap.lck) { @@ -12055,29 +12132,29 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, eASSERT(env, lck_rc == MDBX_RESULT_TRUE); /* exclusive mode */ + const meta_ptr_t recent = meta_recent(env, &xyz); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &xyz); MDBX_meta clone; - const MDBX_meta *const steady = constmeta_prefer_steady(env); - const MDBX_meta *const head = constmeta_prefer_last(env); - const txnid_t steady_txnid = constmeta_txnid(env, steady); - if (META_IS_STEADY(steady)) { - err = validate_meta_copy(env, steady, &clone); + if (prefer_steady.is_steady) { + err = validate_meta_copy(env, prefer_steady.ptr_c, &clone); if (unlikely(err != MDBX_SUCCESS)) { ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", - bytes2pgno(env, (uint8_t *)steady - env->me_map), "steady", - steady_txnid, "manual recovery"); + bytes2pgno(env, (uint8_t *)prefer_steady.ptr_c - env->me_map), + "steady", prefer_steady.txnid, "manual recovery"); return MDBX_CORRUPTED; } - if (steady == head) + if (prefer_steady.ptr_c == recent.ptr_c) break; } - const pgno_t pgno = bytes2pgno(env, (uint8_t *)head - env->me_map); - const txnid_t head_txnid = constmeta_txnid(env, head); - const bool head_valid = - validate_meta_copy(env, head, &clone) == MDBX_SUCCESS; - eASSERT(env, !META_IS_STEADY(steady) || head_txnid != 
steady_txnid); - if (unlikely(!head_valid)) { - if (unlikely(!META_IS_STEADY(steady))) { + const pgno_t pgno = + bytes2pgno(env, (uint8_t *)recent.ptr_c - env->me_map); + const bool last_valid = + validate_meta_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS; + eASSERT(env, + !prefer_steady.is_steady || recent.txnid != prefer_steady.txnid); + if (unlikely(!last_valid)) { + if (unlikely(!prefer_steady.is_steady)) { ERROR("%s for open or automatic rollback, %s", "there are no suitable meta-pages", "manual recovery is required"); @@ -12085,11 +12162,11 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, } WARNING("meta[%u] with last txnid %" PRIaTXN " is corrupted, rollback needed", - pgno, head_txnid); + pgno, recent.txnid); goto purge_meta_head; } - if (meta_bootid_match(head)) { + if (meta_bootid_match(recent.ptr_c)) { if (env->me_flags & MDBX_RDONLY) { ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " "rollback NOT needed, steady-sync NEEDED%s", @@ -12100,12 +12177,12 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " "rollback NOT needed, steady-sync NEEDED%s", "opening after an unclean shutdown", bootid.x, bootid.y, ""); - meta = clone; - atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next, + header = clone; + atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next, mo_Relaxed); break; } - if (unlikely(!META_IS_STEADY(steady))) { + if (unlikely(!prefer_steady.is_steady)) { ERROR("%s, but %s for automatic rollback: %s", "opening after an unclean shutdown", "there are no suitable meta-pages", @@ -12115,30 +12192,31 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, if (env->me_flags & MDBX_RDONLY) { ERROR("%s and rollback needed: (from head %" PRIaTXN " to steady %" PRIaTXN ")%s", - "opening after an unclean shutdown", head_txnid, steady_txnid, - ", but unable in read-only mode"); + "opening after an unclean shutdown", 
recent.txnid, + prefer_steady.txnid, ", but unable in read-only mode"); return MDBX_WANNA_RECOVERY; } purge_meta_head: NOTICE("%s and doing automatic rollback: " "purge%s meta[%u] with%s txnid %" PRIaTXN, - "opening after an unclean shutdown", head_valid ? "" : " invalid", - pgno, head_valid ? " weak" : "", head_txnid); - ENSURE(env, META_IS_STEADY(steady)); - err = override_meta(env, pgno, 0, head_valid ? head : steady); + "opening after an unclean shutdown", last_valid ? "" : " invalid", + pgno, last_valid ? " weak" : "", recent.txnid); + ENSURE(env, prefer_steady.is_steady); + err = override_meta(env, pgno, 0, + last_valid ? recent.ptr_c : prefer_steady.ptr_c); if (err) { ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d", - pgno, head_txnid, err); + pgno, recent.txnid, err); return err; } - ENSURE(env, 0 == meta_txnid(env, head)); - ENSURE(env, 0 == meta_eq_mask(env)); + xyz = meta_tap(env); + ENSURE(env, 0 == meta_txnid(recent.ptr_v)); + ENSURE(env, 0 == meta_eq_mask(&xyz)); } if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { //-------------------------------------------------- shrink DB & update geo - const MDBX_meta *head = constmeta_prefer_last(env); /* re-check size after mmap */ if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || env->me_dxb_mmap.current < used_bytes) { @@ -12147,28 +12225,30 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return MDBX_PROBLEM; } if (env->me_dxb_mmap.current != env->me_dbgeo.now) { - meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); + header.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO " pages", - env->me_dxb_mmap.current, meta.mm_geo.now); + env->me_dxb_mmap.current, header.mm_geo.now); } - if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { + const meta_ptr_t recent = meta_recent(env, &xyz); + if (memcmp(&header.mm_geo, &recent.ptr_c->mm_geo, 
sizeof(header.mm_geo))) { if ((env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) { WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", (env->me_stuck_meta < 0) ? "read-only" : "recovery", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv)); + recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), header.mm_geo.lower, + header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv)); } else { - const txnid_t txnid = constmeta_txnid(env, head); - const txnid_t next_txnid = safe64_txnid_next(txnid); - if (unlikely(txnid > MAX_TXNID)) { + const txnid_t next_txnid = safe64_txnid_next(recent.txnid); + if (unlikely(next_txnid > MAX_TXNID)) { ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } @@ -12177,27 +12257,31 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "/s%u-g%u (txn#%" PRIaTXN "), " "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u (txn#%" PRIaTXN ")", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), pv2pages(head->mm_geo.grow_pv), - txnid, meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv), - next_txnid); + recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, + header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + 
pv2pages(header.mm_geo.grow_pv), next_txnid); - ENSURE(env, meta_eq(env, &meta, head)); - meta_set_txnid(env, &meta, next_txnid); - err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); + ENSURE(env, header.unsafe_txnid == recent.txnid); + meta_set_txnid(env, &header, next_txnid); + err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &header, + &xyz); if (err) { ERROR("error %d, while updating meta.geo: " "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u (txn#%" PRIaTXN "), " "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u (txn#%" PRIaTXN ")", - err, head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv), - next_txnid); + err, recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, + header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv), header.unsafe_txnid); return err; } } @@ -12209,15 +12293,15 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { for (int n = 0; n < NUM_METAS; ++n) { - MDBX_meta *const pmeta = METAPAGE(env, n); - if (unlikely(unaligned_peek_u64(4, &pmeta->mm_magic_and_version) != + MDBX_meta *const meta = METAPAGE(env, n); + if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) != MDBX_DATA_MAGIC)) { - const txnid_t txnid = constmeta_txnid(env, pmeta); + const txnid_t txnid = constmeta_txnid(meta); NOTICE("%s %s" "meta[%u], txnid %" PRIaTXN, "updating db-format signature for", - META_IS_STEADY(pmeta) ? 
"stead-" : "weak-", n, txnid); - err = override_meta(env, n, txnid, pmeta); + META_IS_STEADY(meta) ? "stead-" : "weak-", n, txnid); + err = override_meta(env, n, txnid, meta); if (unlikely(err != MDBX_SUCCESS) && /* Just ignore the MDBX_PROBLEM error, since here it is * returned only in case of the attempt to upgrade an obsolete @@ -12228,6 +12312,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "updating db-format signature for", n, txnid, err); return err; } + xyz = meta_tap(env); } } } @@ -12561,12 +12646,12 @@ __cold static int __must_check_result override_meta(MDBX_env *env, meta_model(env, page, target); MDBX_meta *const model = page_meta(page); meta_set_txnid(env, model, txnid); - eASSERT(env, meta_checktxnid(env, model, true)); + eASSERT(env, coherency_check_meta(env, model, true)); if (shape) { - if (txnid && unlikely(!meta_checktxnid(env, shape, false))) { + if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { ERROR("bailout overriding meta-%u since model failed " "freedb/maindb %s-check for txnid #%" PRIaTXN, - target, "pre", constmeta_txnid(env, shape)); + target, "pre", constmeta_txnid(shape)); return MDBX_PROBLEM; } if (runtime_flags & MDBX_DBG_DONT_UPGRADE) @@ -12587,7 +12672,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, model->mm_dbs[MAIN_DBI].md_root != P_INVALID)) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); - if (unlikely(!meta_checktxnid(env, model, false))) { + if (unlikely(!coherency_check_meta(env, model, false))) { ERROR("bailout overriding meta-%u since model failed " "freedb/maindb %s-check for txnid #%" PRIaTXN, target, "post", txnid); @@ -12595,7 +12680,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, } } } - unaligned_poke_u64(4, model->mm_datasync_sign, meta_sign(model)); + unaligned_poke_u64(4, model->mm_sign, meta_sign(model)); rc = validate_meta(env, model, page, target, nullptr); if 
(unlikely(MDBX_IS_ERROR(rc))) return MDBX_PROBLEM; @@ -12629,7 +12714,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, } osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); - meta_cache_clear(env); + eASSERT(env, !env->me_txn && !env->me_txn0); return rc; } @@ -12645,18 +12730,18 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { return MDBX_EPERM; const MDBX_meta *target_meta = METAPAGE(env, target); - txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(env, target_meta)); + txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(target_meta)); for (unsigned n = 0; n < NUM_METAS; ++n) { - MDBX_page *page = pgno2page(env, n); - MDBX_meta meta = *page_meta(page); if (n == target) continue; - if (validate_meta(env, &meta, page, n, nullptr) != MDBX_SUCCESS) { + MDBX_meta meta = *METAPAGE(env, target); + if (validate_meta(env, &meta, pgno2page(env, n), n, nullptr) != + MDBX_SUCCESS) { int err = override_meta(env, n, 0, nullptr); if (unlikely(err != MDBX_SUCCESS)) return err; } else { - txnid_t txnid = constmeta_txnid(env, &meta); + txnid_t txnid = constmeta_txnid(&meta); if (new_txnid <= txnid) new_txnid = safe64_txnid_next(txnid); } @@ -13157,14 +13242,15 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, #if MDBX_DEBUG if (rc == MDBX_SUCCESS) { - const MDBX_meta *meta = (const MDBX_meta *)meta_prefer_last(env); - const MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; + const meta_xyz_t xyz = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &xyz); + const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; DEBUG("opened database version %u, pagesize %u", - (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), + (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), env->me_psize); DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - data_page(meta)->mp_pgno, constmeta_txnid(env, meta)); + data_page(head.ptr_c)->mp_pgno, head.txnid); DEBUG("depth: %u", 
db->md_depth); DEBUG("entries: %" PRIu64, db->md_entries); DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); @@ -13959,6 +14045,7 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { if ((scan->mt_flags & MDBX_TXN_DIRTY) && (mc->mc_dbi == MAIN_DBI || (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) { + /* После коммита вложенных тразакций может быть mod_txnid > front */ pp_txnid = scan->mt_front; break; } @@ -19412,7 +19499,7 @@ __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { /* Update signature */ assert(meta->mm_geo.now >= meta->mm_geo.next); - unaligned_poke_u64(4, meta->mm_datasync_sign, meta_sign(meta)); + unaligned_poke_u64(4, meta->mm_sign, meta_sign(meta)); } /* Make resizeable */ @@ -19444,7 +19531,7 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, /* copy canary sequences if present */ if (read_txn->mt_canary.v) { meta->mm_canary = read_txn->mt_canary; - meta->mm_canary.v = constmeta_txnid(env, meta); + meta->mm_canary.v = constmeta_txnid(meta); } if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) { @@ -19598,17 +19685,19 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, jitter4testing(false); const size_t meta_bytes = pgno2bytes(env, NUM_METAS); + const meta_xyz_t xyz = meta_tap(env); /* Make a snapshot of meta-pages, * but writing ones after the data was flushed */ memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ - (MDBX_meta *)(buffer + ((uint8_t *)meta_prefer_last(env) - env->me_map)); + (MDBX_meta *)(buffer + + ((uint8_t *)meta_recent(env, &xyz).ptr_c - env->me_map)); mdbx_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) meta_make_sizeable(headcopy); /* Update signature to steady */ - unaligned_poke_u64(4, headcopy->mm_datasync_sign, meta_sign(headcopy)); + unaligned_poke_u64(4, headcopy->mm_sign, meta_sign(headcopy)); /* Copy the data */ const size_t whole_size = 
pgno_align2os_bytes(env, read_txn->mt_end_pgno); @@ -19862,7 +19951,6 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, if (unlikely(rc)) return rc; should_unlock = true; - meta_cache_clear(env); } if (onoff) @@ -20174,21 +20262,30 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; - volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); - arg->mi_recent_txnid = meta_txnid(env, recent_meta); - arg->mi_meta0_txnid = meta_txnid(env, meta0); - arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_datasync_sign); - arg->mi_meta1_txnid = meta_txnid(env, meta1); - arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_datasync_sign); - arg->mi_meta2_txnid = meta_txnid(env, meta2); - arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_datasync_sign); + meta_xyz_t holder; + meta_xyz_t const *xyz; + if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) + xyz = &txn->tw.xyz; + else { + holder = meta_tap(env); + xyz = &holder; + } + + const meta_ptr_t head = meta_recent(env, xyz); + arg->mi_recent_txnid = head.txnid; + arg->mi_meta0_txnid = xyz->txnid[0]; + arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign); + arg->mi_meta1_txnid = xyz->txnid[1]; + arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign); + arg->mi_meta2_txnid = xyz->txnid[2]; + arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign); if (likely(bytes > size_before_bootid)) { memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); } - volatile const MDBX_meta *txn_meta = recent_meta; + const volatile MDBX_meta *txn_meta = head.ptr_v; arg->mi_last_pgno = txn_meta->mm_geo.next - 1; arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); if (txn) { @@ -20858,7 +20955,6 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { txn->mt_dbs[dbi].md_entries = 0; 
txn->mt_dbs[dbi].md_root = P_INVALID; txn->mt_dbs[dbi].md_seq = 0; - /* txn->mt_dbs[dbi].md_mod_txnid = txn->mt_txnid; */ txn->mt_flags |= MDBX_TXN_DIRTY; } @@ -20936,19 +21032,18 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, size_t bytes_retained = 0; uint64_t lag = 0; if (txnid) { + meta_xyz_t xyz = meta_tap(env); retry_header:; - volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, &xyz); const uint64_t head_pages_retired = - unaligned_peek_u64_volatile(4, recent_meta->mm_pages_retired); - const txnid_t head_txnid = meta_txnid(env, recent_meta); - if (unlikely(recent_meta != meta_prefer_last(env) || + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); + if (unlikely(meta_should_retry(env, &xyz) || head_pages_retired != unaligned_peek_u64_volatile( - 4, recent_meta->mm_pages_retired)) || - head_txnid != meta_txnid(env, recent_meta)) + 4, head.ptr_v->mm_pages_retired))) goto retry_header; - lag = (head_txnid - txnid) / xMDBX_TXNID_STEP; + lag = (head.txnid - txnid) / xMDBX_TXNID_STEP; bytes_used = pgno2bytes(env, pages_used); bytes_retained = (head_pages_retired > reader_pages_retired) ? 
pgno2bytes(env, (pgno_t)(head_pages_retired - @@ -21136,19 +21231,20 @@ __cold int mdbx_setup_debug(int level, int flags, MDBX_debug_func *logger) { __cold static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard) { DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); + osal_memory_fence(mo_AcquireRelease, false); MDBX_hsr_func *const callback = env->me_hsr_callback; txnid_t oldest = 0; bool notify_eof_of_loop = false; int retry = 0; do { + const txnid_t steady = + env->me_txn->tw.xyz.txnid[env->me_txn->tw.xyz.prefer_steady]; env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; - oldest = find_oldest_reader(env); + oldest = find_oldest_reader(env, steady); eASSERT(env, oldest < env->me_txn0->mt_txnid); eASSERT(env, oldest >= laggard); eASSERT(env, oldest >= env->me_lck->mti_oldest_reader.weak); - osal_memory_fence(mo_AcquireRelease, false); - const txnid_t steady = meta_txnid(env, meta_prefer_steady(env)); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck) break; @@ -21181,11 +21277,10 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, stucked->mr_snapshot_pages_retired.weak != hold_retired) continue; - const MDBX_meta *head_meta = constmeta_prefer_last(env); - const txnid_t gap = - (constmeta_txnid(env, head_meta) - laggard) / xMDBX_TXNID_STEP; + const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.xyz); + const txnid_t gap = (head.txnid - laggard) / xMDBX_TXNID_STEP; const uint64_t head_retired = - unaligned_peek_u64(4, head_meta->mm_pages_retired); + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired); const size_t space = (head_retired > hold_retired) ? 
pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) @@ -21268,18 +21363,18 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) return 0; } - txnid_t recent = 0; - volatile const MDBX_meta *meta = nullptr; + txnid_t lag; + meta_xyz_t xyz = meta_tap(env); do { - meta = meta_prefer_last(env); - recent = meta_txnid(env, meta); + const meta_ptr_t head = meta_recent(env, &xyz); if (percent) { - const pgno_t maxpg = meta->mm_geo.now; - *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); + const pgno_t maxpg = head.ptr_v->mm_geo.now; + *percent = + (int)((head.ptr_v->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } - } while (unlikely(recent != meta_txnid(env, meta))); + lag = (head.txnid - txn->mt_txnid) / xMDBX_TXNID_STEP; + } while (unlikely(meta_should_retry(env, &xyz))); - txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP; return (lag > INT_MAX) ? INT_MAX : (int)lag; } @@ -22687,7 +22782,6 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; - meta_cache_clear(env); } env->me_options.dp_reserve_limit = (unsigned)value; while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { @@ -22724,7 +22818,6 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; - meta_cache_clear(env); } if (env->me_txn) err = MDBX_EPERM /* unable change during transaction */; @@ -22891,6 +22984,51 @@ __cold void global_ctor(void) { bootid = osal_bootid(); +#if MDBX_DEBUG + for (unsigned i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { + const bool s0 = (i >> 0) & 1; + const bool s1 = (i >> 1) & 1; + const bool s2 = (i >> 2) & 1; + const uint8_t c01 = (i / (8 * 1)) % 3; + const uint8_t c02 = (i / (8 * 3)) % 3; + const uint8_t c12 = (i / (8 * 9)) % 3; + const uint8_t xyz = meta_cmp2pack(c01, c02, c12, s0, s1, s2); + + const uint8_t recent = (xyz >> 2) & 3; + const 
uint8_t prefer_steady = (xyz >> 4) & 3; + const uint8_t tail = xyz & 3; + const bool strict = (xyz & 64) != 0; + const bool valid = (xyz & 128) != 0; + + const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady_chk = + meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); + + uint8_t tail_chk; + if (recent_chk == 0) + tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent_chk == 1) + tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail_chk = meta_cmp2steady(c01, s0, s1) ? 1 : 0; + + const bool valid_chk = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + assert(recent == recent_chk); + assert(prefer_steady == prefer_steady_chk); + assert(tail == tail_chk); + assert(valid == valid_chk); + assert(strict == strict_chk); + // printf(" %d, ", xyz); + assert(xyz_fsm_map[i] == xyz); + } +#endif /* MDBX_DEBUG*/ + #if 0 /* debug */ for (unsigned i = 0; i < 65536; ++i) { size_t pages = pv2pages(i); diff --git a/src/internals.h b/src/internals.h index ab74f25a..77b2048a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -455,7 +455,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - MDBX_atomic_uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -474,8 +477,11 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, 
(meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ MDBX_atomic_uint32_t mm_txnid_b[2]; @@ -913,6 +919,13 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct xyz { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#define XYZ_HAVE_STEADY(xyz) ((xyz)->fsm & 7) +#define XYZ_VALID(xyz) ((xyz)->tail_pgno_and_flags & 128) + txnid_t txnid[NUM_METAS]; +} meta_xyz_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -986,6 +999,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_xyz_t xyz; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ @@ -1199,10 +1213,6 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; -#if MDBX_CACHE_METAPTR - volatile const MDBX_meta *cache_last_meta; - volatile const MDBX_meta *cache_steady_meta; -#endif /* MDBX_CACHE_METAPTR */ MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ diff --git a/src/options.h b/src/options.h index 4fb8b4fb..b5d760b4 100644 --- a/src/options.h +++ b/src/options.h @@ -92,12 +92,6 @@ #error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 #endif /* MDBX_ENABLE_BIGFOOT */ -#ifndef MDBX_CACHE_METAPTR -#define MDBX_CACHE_METAPTR 0 -#elif !(MDBX_CACHE_METAPTR == 0 || MDBX_CACHE_METAPTR == 1) -#error MDBX_CACHE_METAPTR must be defined as 0 or 1 -#endif /* MDBX_CACHE_METAPTR */ - /** Controls use of POSIX madvise() hints and friends. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 From 08a8f844dc0bf6a048066c21a21ecbb573e06841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 18 Aug 2022 01:10:27 +0300 Subject: [PATCH 095/364] =?UTF-8?q?mdbx:=20=C3=974=20ARM-Neon=20accelerate?= =?UTF-8?q?d=20`scan4seq()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/base.h | 3 +++ src/core.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/src/base.h b/src/base.h index 96e88f11..a927f805 100644 --- a/src/base.h +++ b/src/base.h @@ -417,6 +417,9 @@ __extern_C key_t ftok(const char *, int); #include #include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ diff --git a/src/core.c b/src/core.c index 5e3a2a45..3598c00d 100644 --- a/src/core.c +++ b/src/core.c @@ -5866,6 +5866,11 @@ MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(unsigned value) { } #endif /* _MSC_VER */ +#if defined(_MSC_VER) && !defined(__builtin_clzl) && \ + !__has_builtin(__builtin_clzl) +#define __builtin_clzl(value) __builtin_clz(value) +#endif /* _MSC_VER */ + #if !defined(MDBX_ATTRIBUTE_TARGET) && \ (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) @@ -6098,6 +6103,74 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { } #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ +#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr, + const ptrdiff_t offset, + const uint32x4_t pattern) { + const uint32x4_t f = vld1q_u32(ptr); + const 
uint32x4_t l = vld1q_u32(ptr + offset); + const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern)); + if (sizeof(size_t) > 7) + return vget_lane_u64(vreinterpret_u64_u16(cmp), 0); + else + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))), + 0); +} + +__hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, + const unsigned seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const uint32x4_t pattern = vmovq_n_u32(target); + size_t mask; + if (likely(len > seq + 3)) { + do { + mask = diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return (pgno_t *)((char *)range - + (__builtin_clzl(mask) >> sizeof(size_t) / 4)); + } + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = (~(size_t)0) << (extra * sizeof(size_t) * 2); + mask &= diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* __ARM_NEON || __ARM_NEON__ */ + #if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) #define scan4seq_default scan4seq_avx512bw #define scan4seq scan4seq_default @@ -6105,6 +6178,9 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { #define scan4seq_default scan4seq_avx2 #elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) #define scan4seq_default scan4seq_sse2 +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define scan4seq_default scan4seq_neon /* Choosing of another variants should be added here. */ #endif /* scan4seq_default */ From 4cef1c2376b56dbe07e9eec03e0d5d4e74e21177 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 18 Aug 2022 01:39:06 +0300 Subject: [PATCH 096/364] mdbx: avoid extra using `F_ISSET()` macro. 
--- src/core.c | 77 +++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/src/core.c b/src/core.c index 3598c00d..2a5afce8 100644 --- a/src/core.c +++ b/src/core.c @@ -3449,10 +3449,9 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, static const char *leafnode_type(MDBX_node *n) { static const char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(node_flags(n), F_BIGDATA) + return (node_flags(n) & F_BIGDATA) ? ": large page" - : tp[F_ISSET(node_flags(n), F_DUPDATA)] - [F_ISSET(node_flags(n), F_SUBDATA)]; + : tp[!!(node_flags(n) & F_DUPDATA)][!!(node_flags(n) & F_SUBDATA)]; } /* Display all the keys in the page. */ @@ -3512,7 +3511,7 @@ MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { DKEY(&key)); total += nsize; } else { - if (F_ISSET(node_flags(node), F_BIGDATA)) + if (node_flags(node) & F_BIGDATA) nsize += sizeof(pgno_t); else nsize += (unsigned)node_ds(node); @@ -7818,7 +7817,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Not yet touching txn == env->me_txn0, it may be active */ jitter4testing(false); - rc = mdbx_txn_lock(env, F_ISSET(flags, MDBX_TXN_TRY)); + rc = mdbx_txn_lock(env, !!(flags & MDBX_TXN_TRY)); if (unlikely(rc)) return rc; if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { @@ -7978,7 +7977,7 @@ static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { if (unlikely(err)) return err; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) return MDBX_EACCESS; return MDBX_SUCCESS; @@ -8600,11 +8599,11 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { cursors_eot(txn, false); int rc = MDBX_SUCCESS; - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { + if (txn->mt_flags & MDBX_TXN_RDONLY) { if (txn->to.reader) { MDBX_reader *slot = txn->to.reader; eASSERT(env, slot->mr_pid.weak == env->me_pid); - if (likely(!F_ISSET(txn->mt_flags, 
MDBX_TXN_FINISHED))) { + if (likely(!(txn->mt_flags & MDBX_TXN_FINISHED))) { eASSERT(env, txn->mt_txnid == slot->mr_txnid.weak && slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); @@ -8632,7 +8631,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { txn->mt_numdbs = 0; /* prevent further DBI activity */ txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; txn->mt_owner = 0; - } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { + } else if (!(txn->mt_flags & MDBX_TXN_FINISHED)) { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (txn == env->me_txn0) txn_valgrind(env, nullptr); @@ -8763,7 +8762,7 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) + if (txn->mt_flags & MDBX_TXN_RDONLY) /* LY: don't close DBI-handles */ return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE); @@ -10271,7 +10270,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) goto done; if (txn->mt_child) { @@ -13153,8 +13152,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; - rc = osal_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ - : MDBX_OPEN_DXB_LAZY, + rc = osal_openfile((flags & MDBX_RDONLY) ? 
MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, env, env_pathname.dxb, &env->me_lazy_fd, mode); if (rc != MDBX_SUCCESS) goto bailout; @@ -14274,7 +14273,7 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (cx.outer.mc_xcursor != NULL) { MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { // coverity[uninit_use : FALSE] tASSERT(txn, cx.outer.mc_xcursor == &cx.inner && (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); @@ -14369,7 +14368,7 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (mc->mc_db->md_flags & MDBX_DUPSORT) { node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { @@ -14427,7 +14426,7 @@ skip: } node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14465,7 +14464,7 @@ static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if ((mc->mc_db->md_flags & MDBX_DUPSORT) && mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (op == MDBX_PREV || op == MDBX_PREV_DUP) { rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { @@ -14518,7 +14517,7 @@ static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != 
MDBX_SUCCESS)) return rc; @@ -14737,7 +14736,7 @@ got_node: return ret; } - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { ret.err = cursor_xinit1(mc, node, mp); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; @@ -14849,7 +14848,7 @@ static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { } MDBX_node *node = page_node(mp, 0); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14898,7 +14897,7 @@ static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { } MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14958,7 +14957,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); get_key_optional(node, key); if (data) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) @@ -15077,7 +15076,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(node_flags(node), F_DUPDATA)) { + if (!(node_flags(node) & F_DUPDATA)) { get_key_optional(node, key); rc = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); break; @@ -15340,7 +15339,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(flags & MDBX_MULTIPLE)) { if (unlikely(flags & MDBX_RESERVE)) return MDBX_EINVAL; - if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDBX_DUPFIXED))) + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; dcount = 
data[1].iov_len; if (unlikely(dcount < 2 || data->iov_len == 0)) @@ -15459,9 +15458,9 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely((flags & MDBX_MULTIPLE))) goto drop_current; - if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { cASSERT(mc, mc->mc_xcursor != NULL && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); /* Если за ключом более одного значения, либо если размер данных @@ -15687,7 +15686,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); /* Large/Overflow page overwrites need special handling */ - if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { + if (unlikely(node_flags(node) & F_BIGDATA)) { int dpages = (node_size(key, data) > env->me_leaf_nodemax) ? number_of_ovpages(env, data->iov_len) : 0; @@ -15742,7 +15741,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } } node_set_ds(node, data->iov_len); - if (F_ISSET(flags, MDBX_RESERVE)) + if (flags & MDBX_RESERVE) data->iov_base = page_data(lp.page); else memcpy(page_data(lp.page), data->iov_base, data->iov_len); @@ -15764,7 +15763,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { /* Prepare (sub-)page/sub-DB to accept the new item, if needed. * fp: old sub-page or a header faking it. * mp: new (sub-)page. offset: growth in page size. 
@@ -15775,7 +15774,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; /* Was a single item before, must convert now */ - if (!F_ISSET(node_flags(node), F_DUPDATA)) { + if (!(node_flags(node) & F_DUPDATA)) { /* does data match? */ const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); @@ -15931,7 +15930,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. */ - if (F_ISSET(flags, MDBX_RESERVE)) + if (flags & MDBX_RESERVE) data->iov_base = olddata.iov_base; else if (!(mc->mc_flags & C_SUB)) memcpy(olddata.iov_base, data->iov_base, data->iov_len); @@ -16147,13 +16146,13 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { goto del_key; MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { /* cursor_del() will subtract the final entry */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { - if (!F_ISSET(node_flags(node), F_SUBDATA)) + if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); if (unlikely(rc)) @@ -16212,7 +16211,7 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return MDBX_INCOMPATIBLE; /* add large/overflow pages to free list */ - if (F_ISSET(node_flags(node), F_BIGDATA)) { + if (node_flags(node) & F_BIGDATA) { pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page)))) goto fail; @@ -16836,7 +16835,7 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor 
*mc, MDBX_dbi dbi) { if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) return MDBX_BAD_DBI; - if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY))) return MDBX_EACCESS; if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { @@ -17027,7 +17026,7 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { *countp = 1; if (mc->mc_xcursor != NULL) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { cASSERT(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) @@ -19217,7 +19216,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /* LY: allows update (explicit overwrite) only for unique keys */ MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { tASSERT(txn, XCURSOR_INITED(&cx.outer) && cx.outer.mc_xcursor->mx_db.md_entries > 1); rc = MDBX_EMULTIVAL; @@ -22162,7 +22161,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (begin.outer.mc_xcursor != NULL) { MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top], begin.outer.mc_ki[begin.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { /* LY: return the number of duplicates for given key */ tASSERT(txn, begin.outer.mc_xcursor == &begin.inner && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED)); @@ -22329,7 +22328,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (flags & MDBX_CURRENT) { /* disallow update/delete for multi-values */ MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & 
F_DUPDATA) { tASSERT(txn, XCURSOR_INITED(&cx.outer) && cx.outer.mc_xcursor->mx_db.md_entries > 1); if (cx.outer.mc_xcursor->mx_db.md_entries > 1) { From b759dfafd72bd422da958d0243320ecad8675613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 19 Aug 2022 22:23:55 +0300 Subject: [PATCH 097/364] mdbx: counting large/overflow dirty pages (unused for now). This is a basis for [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/192). --- src/core.c | 48 ++++++++++++++++++++++++++++++++++-------------- src/internals.h | 1 + 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/core.c b/src/core.c index 2a5afce8..18de1413 100644 --- a/src/core.c +++ b/src/core.c @@ -2721,6 +2721,7 @@ static __always_inline void dpl_clear(MDBX_dpl *dl) { static const MDBX_page dpl_stub_pageB = {{0}, 0, P_BAD, {0}, /* pgno */ 0}; assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); dl->sorted = dpl_setlen(dl, 0); + dl->pages_including_loose = 0; dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; dl->items[0].pgno = 0; dl->items[0].extra = 0; @@ -2944,10 +2945,11 @@ MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, return nullptr; } -static void dpl_remove(const MDBX_txn *txn, unsigned i) { +static void dpl_remove_ex(const MDBX_txn *txn, unsigned i, unsigned npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert((int)i > 0 && i <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + dl->pages_including_loose -= npages; dl->sorted -= dl->sorted >= i; dl->length -= 1; memmove(dl->items + i, dl->items + i + 1, @@ -2955,6 +2957,10 @@ static void dpl_remove(const MDBX_txn *txn, unsigned i) { assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } +static void dpl_remove(const MDBX_txn 
*txn, unsigned i) { + dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); +} + static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, @@ -3001,6 +3007,7 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, dl->items[length].lru = txn->tw.dirtylru++; dl->length = length; dl->sorted = sorted; + dl->pages_including_loose += npages; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); return MDBX_SUCCESS; } @@ -3655,10 +3662,8 @@ static void dlist_free(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; MDBX_dpl *const dl = txn->tw.dirtylist; - for (unsigned i = 1; i <= dl->length; i++) { - MDBX_page *dp = dl->items[i].ptr; - dpage_free(env, dp, dpl_npages(dl, i)); - } + for (unsigned i = 1; i <= dl->length; i++) + dpage_free(env, dl->items[i].ptr, dpl_npages(dl, i)); dpl_clear(dl); } @@ -3682,7 +3687,7 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { if (!AUDIT_ENABLED()) return true; - unsigned loose = 0; + unsigned loose = 0, pages = 0; for (unsigned i = dl->length; i > 0; --i) { const MDBX_page *const dp = dl->items[i].ptr; if (!dp) @@ -3704,6 +3709,7 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { return false; const unsigned num = dpl_npages(dl, i); + pages += num; tASSERT(txn, txn->mt_next_pgno >= dp->mp_pgno + num); if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) return false; @@ -3734,6 +3740,10 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { if (unlikely(loose != txn->tw.loose_count)) return false; + tASSERT(txn, pages == dl->pages_including_loose); + if (unlikely(pages != dl->pages_including_loose)) + return false; + for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); tASSERT(txn, !dp); @@ -3829,6 +3839,7 @@ static void refund_loose(MDBX_txn *txn) { most, txn->mt_next_pgno); txn->tw.loose_count 
-= refunded; txn->tw.dirtyroom += refunded; + dl->pages_including_loose -= refunded; assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); txn->mt_next_pgno = most; @@ -3882,6 +3893,7 @@ static void refund_loose(MDBX_txn *txn) { dl->sorted = dl->length; txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; + dl->pages_including_loose -= refunded; tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); @@ -3981,7 +3993,7 @@ static __inline void page_wash(MDBX_txn *txn, const unsigned di, MDBX_page *const mp, const unsigned npages) { tASSERT(txn, di && di <= txn->tw.dirtylist->length && txn->tw.dirtylist->items[di].ptr == mp); - dpl_remove(txn, di); + dpl_remove_ex(txn, di, npages); txn->tw.dirtyroom++; tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom @@ -8534,12 +8546,12 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { } /* update loop */ - unsigned w = r; + unsigned npages, w = r; remove_dl: - if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) { - MDBX_page *dp = dl->items[r].ptr; - dpage_free(txn->mt_env, dp, dpl_npages(dl, r)); - } + npages = dpl_npages(dl, r); + dl->pages_including_loose -= npages; + if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) + dpage_free(txn->mt_env, dl->items[r].ptr, npages); ++r; next_i: i += step; @@ -9253,6 +9265,7 @@ retry: tASSERT(txn, txn->tw.loose_count == dl->length - w); dpl_setlen(dl, w); dl->sorted = 0; + dl->pages_including_loose -= txn->tw.loose_count; txn->tw.dirtyroom += txn->tw.loose_count; tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom @@ -9906,8 +9919,8 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, unsigned n = dst->length; while (n && dst->items[n].pgno >= parent->mt_next_pgno) { if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { - MDBX_page *dp = dst->items[n].ptr; - dpage_free(txn->mt_env, dp, dpl_npages(dst, n)); + unsigned npages = dpl_npages(dst, n); + dpage_free(txn->mt_env, dst->items[n].ptr, npages); } --n; } @@ -10218,6 +10231,13 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); dpl_setlen(dst, dst->sorted); parent->tw.dirtylru = txn->tw.dirtylru; + + /* В текущем понимании выгоднее пересчитать кол-во страниц, + * чем подмешивать лишние ветвления и вычисления в циклы выше. */ + dst->pages_including_loose = 0; + for (r = 1; r <= dst->length; ++r) + dst->pages_including_loose += dpl_npages(dst, r); + tASSERT(parent, dirtylist_check(parent)); dpl_free(txn); diff --git a/src/internals.h b/src/internals.h index 77b2048a..c58345c0 100644 --- a/src/internals.h +++ b/src/internals.h @@ -868,6 +868,7 @@ typedef struct MDBX_dp { typedef struct MDBX_dpl { unsigned sorted; unsigned length; + unsigned pages_including_loose; /* number of pages, but not an entries. */ unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) From b617f25eaad385634dd7b280c6b3f8570fc165b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 20 Aug 2022 00:28:32 +0300 Subject: [PATCH 098/364] mdbx: refine & rename internal `xyz` to `troika`. 
--- src/core.c | 266 +++++++++++++++++++++++++----------------------- src/internals.h | 12 ++- 2 files changed, 145 insertions(+), 133 deletions(-) diff --git a/src/core.c b/src/core.c index 18de1413..1da32061 100644 --- a/src/core.c +++ b/src/core.c @@ -3125,7 +3125,7 @@ static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, const mdbx_mode_t mode_bits); static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, - meta_xyz_t *const xyz); + meta_troika_t *const troika); static int env_close(MDBX_env *env); struct node_result { @@ -5057,10 +5057,9 @@ static __inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady, return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); } -MDBX_MAYBE_UNUSED static __inline uint8_t meta_cmp2pack(uint8_t c01, - uint8_t c02, - uint8_t c12, bool s0, - bool s1, bool s2) { +MDBX_MAYBE_UNUSED static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, + uint8_t c12, bool s0, bool s1, + bool s2) { assert(c01 < 3 && c02 < 3 && c12 < 3); /* assert(s0 < 2 && s1 < 2 && s2 < 2); */ const uint8_t recent = meta_cmp2recent(c01, s0, s1) @@ -5085,7 +5084,14 @@ MDBX_MAYBE_UNUSED static __inline uint8_t meta_cmp2pack(uint8_t c01, return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7; } -static const uint8_t xyz_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { +static __inline void meta_troika_unpack(meta_troika_t *troika, + const uint8_t packed) { + troika->recent = (packed >> 2) & 3; + troika->prefer_steady = (packed >> 4) & 3; + troika->tail_and_flags = packed & 0xC3; +} + +static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { 232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232, 168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169, 232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233, @@ -5102,27 +5108,24 @@ static const uint8_t xyz_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { 214, 228, 198, 212, 
214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194, 210, 194, 225, 193, 210, 194}; -__hot static meta_xyz_t meta_tap(const MDBX_env *env) { +__hot static meta_troika_t meta_tap(const MDBX_env *env) { meta_snap_t snap; - meta_xyz_t r; + meta_troika_t troika; snap = meta_snap(METAPAGE(env, 0)); - r.txnid[0] = snap.txnid; - r.fsm = (uint8_t)snap.is_steady << 0; + troika.txnid[0] = snap.txnid; + troika.fsm = (uint8_t)snap.is_steady << 0; snap = meta_snap(METAPAGE(env, 1)); - r.txnid[1] = snap.txnid; - r.fsm += (uint8_t)snap.is_steady << 1; - r.fsm += meta_cmp2int(r.txnid[0], r.txnid[1], 8); + troika.txnid[1] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 1; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8); snap = meta_snap(METAPAGE(env, 2)); - r.txnid[2] = snap.txnid; - r.fsm += (uint8_t)snap.is_steady << 2; - r.fsm += meta_cmp2int(r.txnid[0], r.txnid[2], 8 * 3); - r.fsm += meta_cmp2int(r.txnid[1], r.txnid[2], 8 * 3 * 3); + troika.txnid[2] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 2; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3); + troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3); - const uint8_t xyz = xyz_fsm_map[r.fsm]; - r.recent = (xyz >> 2) & 3; - r.prefer_steady = (xyz >> 4) & 3; - r.tail_and_flags = xyz & 0xC3; - return r; + meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]); + return troika; } static txnid_t recent_committed_txnid(const MDBX_env *env) { @@ -5132,47 +5135,51 @@ static txnid_t recent_committed_txnid(const MDBX_env *env) { return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? 
m1 : m2); } -static __inline bool meta_eq(const meta_xyz_t *z, unsigned y, unsigned x) { - assert(y < NUM_METAS && x < NUM_METAS); - return z->txnid[y] == z->txnid[x] && !(((z->fsm >> y) ^ (z->fsm >> x)) & 1); +static __inline bool meta_eq(const meta_troika_t *troika, unsigned a, + unsigned b) { + assert(a < NUM_METAS && b < NUM_METAS); + return troika->txnid[a] == troika->txnid[b] && + (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0; } -static unsigned meta_eq_mask(const meta_xyz_t *xyz) { - return meta_eq(xyz, 0, 1) | meta_eq(xyz, 1, 2) << 1 | meta_eq(xyz, 2, 0) << 2; +static unsigned meta_eq_mask(const meta_troika_t *troika) { + return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 | + meta_eq(troika, 2, 0) << 2; } -__hot static bool meta_should_retry(const MDBX_env *env, meta_xyz_t *xyz) { - const meta_xyz_t prev = *xyz; - *xyz = meta_tap(env); - return prev.fsm != xyz->fsm || prev.txnid[0] != xyz->txnid[0] || - prev.txnid[1] != xyz->txnid[1] || prev.txnid[2] != xyz->txnid[2]; +__hot static bool meta_should_retry(const MDBX_env *env, + meta_troika_t *troika) { + const meta_troika_t prev = *troika; + *troika = meta_tap(env); + return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] || + prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2]; } static __always_inline meta_ptr_t meta_recent(const MDBX_env *env, - const meta_xyz_t *xyz) { + const meta_troika_t *troika) { meta_ptr_t r; - r.txnid = xyz->txnid[xyz->recent]; - r.ptr_v = METAPAGE(env, xyz->recent); - r.is_steady = (xyz->fsm >> xyz->recent) & 1; + r.txnid = troika->txnid[troika->recent]; + r.ptr_v = METAPAGE(env, troika->recent); + r.is_steady = (troika->fsm >> troika->recent) & 1; return r; } -static __always_inline meta_ptr_t meta_prefer_steady(const MDBX_env *env, - const meta_xyz_t *xyz) { +static __always_inline meta_ptr_t +meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) { meta_ptr_t r; - r.txnid = xyz->txnid[xyz->prefer_steady]; - r.ptr_v = 
METAPAGE(env, xyz->prefer_steady); - r.is_steady = (xyz->fsm >> xyz->prefer_steady) & 1; + r.txnid = troika->txnid[troika->prefer_steady]; + r.ptr_v = METAPAGE(env, troika->prefer_steady); + r.is_steady = (troika->fsm >> troika->prefer_steady) & 1; return r; } static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, - const meta_xyz_t *xyz) { - const uint8_t tail = xyz->tail_and_flags & 3; + const meta_troika_t *troika) { + const uint8_t tail = troika->tail_and_flags & 3; meta_ptr_t r; - r.txnid = xyz->txnid[tail]; + r.txnid = troika->txnid[tail]; r.ptr_v = METAPAGE(env, tail); - r.is_steady = (xyz->fsm >> tail) & 1; + r.is_steady = (troika->fsm >> tail) & 1; return r; } @@ -5250,7 +5257,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { return find_oldest_reader(txn->mt_env, - txn->tw.xyz.txnid[txn->tw.xyz.prefer_steady]); + txn->tw.troika.txnid[txn->tw.troika.prefer_steady]); } /* Find largest mvcc-snapshot still referenced. 
*/ @@ -5775,10 +5782,10 @@ __cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) { /* force oldest refresh */ atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - txn->tw.xyz = meta_tap(env); + txn->tw.troika = meta_tap(env); for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child) if (scan != txn) - scan->tw.xyz = txn->tw.xyz; + scan->tw.troika = txn->tw.troika; return MDBX_SUCCESS; } @@ -6541,8 +6548,8 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { const size_t next = (size_t)pgno + num; if (flags & MDBX_ALLOC_GC) { - const meta_ptr_t recent = meta_recent(env, &txn->tw.xyz); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.xyz); + const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); /* does reclaiming stopped at the last steady point? */ if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && detent == prefer_steady.txnid + 1) { @@ -6571,7 +6578,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { ret.err = wipe_steady(txn, detent); DEBUG("gc-wipe-steady, rc %d", ret.err); eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.xyz).ptr_c); + meta_prefer_steady(env, &txn->tw.troika).ptr_c); } else if ((flags & MDBX_ALLOC_NEW) == 0 || (autosync_threshold && atomic_load32(&env->me_lck->mti_unsynced_pages, @@ -6587,10 +6594,10 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* make steady checkpoint. 
*/ MDBX_meta meta = *recent.ptr_c; ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, - &txn->tw.xyz); + &txn->tw.troika); DEBUG("gc-make-steady, rc %d", ret.err); eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.xyz).ptr_c); + meta_prefer_steady(env, &txn->tw.troika).ptr_c); } if (likely(ret.err != MDBX_RESULT_TRUE)) { if (unlikely(ret.err != MDBX_SUCCESS)) @@ -7033,10 +7040,10 @@ retry:; const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); meta_ptr_t head; if (inside_txn | locked) - head = meta_recent(env, &env->me_txn0->tw.xyz); + head = meta_recent(env, &env->me_txn0->tw.troika); else { - const meta_xyz_t xyz = meta_tap(env); - head = meta_recent(env, &xyz); + const meta_troika_t troika = meta_tap(env); + head = meta_recent(env, &troika); } const pgno_t unsynced_pages = atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); @@ -7109,7 +7116,7 @@ retry:; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += wops; #endif /* MDBX_ENABLE_PGOP_STAT */ - env->me_txn0->tw.xyz = meta_tap(env); + env->me_txn0->tw.troika = meta_tap(env); eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); goto retry; } @@ -7127,7 +7134,7 @@ retry:; data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), unsynced_pages); MDBX_meta meta = *head.ptr_c; - rc = sync_locked(env, flags, &meta, &env->me_txn0->tw.xyz); + rc = sync_locked(env, flags, &meta, &env->me_txn0->tw.troika); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -7344,7 +7351,7 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { pgno_t last = MAX_PAGENO + 1; if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { /* inside write-txn */ - last = meta_recent(env, &env->me_txn0->xyz).ptr_v->mm_geo.next; + last = meta_recent(env, &env->me_txn0->troika).ptr_v->mm_geo.next; } else if (env->me_flags & MDBX_RDONLY) { /* read-only mode, no write-txn, no wlock mutex */ last = NUM_METAS; @@ -7721,11 +7728,11 @@ static int 
txn_renew(MDBX_txn *txn, const unsigned flags) { /* Seek & fetch the last meta */ uint64_t timestamp = 0; unsigned loop = 0; - meta_xyz_t xyz = meta_tap(env); + meta_troika_t troika = meta_tap(env); while (1) { const meta_ptr_t head = likely(env->me_stuck_meta < 0) - ? /* regular */ meta_recent(env, &xyz) + ? /* regular */ meta_recent(env, &troika) : /* recovery mode */ meta_ptr(env, env->me_stuck_meta); if (likely(r)) { safe64_reset(&r->mr_txnid, false); @@ -7760,7 +7767,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { if (unlikely(env->me_stuck_meta >= 0)) break; - if (unlikely(meta_should_retry(env, &xyz) || + if (unlikely(meta_should_retry(env, &troika) || head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease))) { if (unlikely(++loop > 42)) { @@ -7843,8 +7850,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { } #endif /* Windows */ - txn->tw.xyz = meta_tap(env); - const meta_ptr_t head = meta_recent(env, &txn->tw.xyz); + txn->tw.troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); uint64_t timestamp = 0; while ( "workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { @@ -8197,7 +8204,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_numdbs = parent->mt_numdbs; txn->mt_owner = parent->mt_owner; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); - txn->tw.xyz = parent->tw.xyz; + txn->tw.troika = parent->tw.troika; /* Copy parent's mt_dbistate, but clear DB_NEW */ for (unsigned i = 0; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] = @@ -8274,17 +8281,17 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { if (txn->mt_flags & MDBX_TXN_RDONLY) { meta_ptr_t head; uint64_t head_retired; - meta_xyz_t xyz = meta_tap(env); + meta_troika_t troika = meta_tap(env); do { /* fetch info from volatile head */ - head = meta_recent(env, &xyz); + head = meta_recent(env, &troika); head_retired = 
unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->mm_geo.now); info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->mm_geo.upper); info->txn_space_leftover = pgno2bytes(env, head.ptr_v->mm_geo.now - head.ptr_v->mm_geo.next); - } while (unlikely(meta_should_retry(env, &xyz))); + } while (unlikely(meta_should_retry(env, &troika))); info->txn_reader_lag = head.txnid - info->txn_id; info->txn_space_dirty = info->txn_space_retired = 0; @@ -8672,8 +8679,8 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - eASSERT(env, - memcmp(&txn->tw.xyz, &parent->tw.xyz, sizeof(meta_xyz_t)) == 0); + eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, + sizeof(meta_troika_t)) == 0); if (txn->tw.lifo_reclaimed) { eASSERT(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= @@ -10530,7 +10537,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ts_3 = latency ? osal_monotime() : 0; if (likely(rc == MDBX_SUCCESS)) { - const meta_ptr_t head = meta_recent(env, &txn->tw.xyz); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); MDBX_meta meta; memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); meta.mm_extra_flags = head.ptr_c->mm_extra_flags; @@ -10555,7 +10562,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { meta_set_txnid(env, &meta, commit_txnid); rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, - &meta, &txn->tw.xyz); + &meta, &txn->tw.troika); } ts_4 = latency ? 
osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { @@ -11012,12 +11019,12 @@ static size_t madvise_threshold(const MDBX_env *env, #endif /* MDBX_ENABLE_MADVISE */ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, - meta_xyz_t *const xyz) { + meta_troika_t *const troika) { eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - const meta_ptr_t head = meta_recent(env, xyz); + const meta_ptr_t head = meta_recent(env, troika); int rc; eASSERT(env, @@ -11122,7 +11129,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, const pgno_t bottom = (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; if (pending->mm_geo.now > bottom) { - if (XYZ_HAVE_STEADY(xyz)) + if (TROIKA_HAVE_STEADY(troika)) /* force steady, but only if steady-checkpoint is present */ flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; shrink = pending->mm_geo.now - bottom; @@ -11154,7 +11161,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; if ((flags & MDBX_SAFE_NOSYNC) == 0) { mode_bits = MDBX_SYNC_DATA; - if (pending->mm_geo.next > meta_prefer_steady(env, xyz).ptr_c->mm_geo.now) + if (pending->mm_geo.next > + meta_prefer_steady(env, troika).ptr_c->mm_geo.now) mode_bits |= MDBX_SYNC_SIZE; if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; @@ -11204,10 +11212,10 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, return MDBX_SUCCESS; } } else { - const unsigned xyz_tail = xyz->tail_and_flags & 3; - ENSURE(env, xyz_tail < NUM_METAS && xyz_tail != xyz->recent && - xyz_tail != xyz->prefer_steady); - target = (MDBX_meta *)meta_tail(env, xyz).ptr_c; + const unsigned troika_tail = troika->tail_and_flags & 3; + ENSURE(env, troika_tail < NUM_METAS && 
troika_tail != troika->recent && + troika_tail != troika->prefer_steady); + target = (MDBX_meta *)meta_tail(env, troika).ptr_c; } /* LY: step#2 - update meta-page. */ @@ -11336,10 +11344,10 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, (uint32_t)pending->unsafe_txnid - ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0); - *xyz = meta_tap(env); + *troika = meta_tap(env); for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) - if (xyz != &txn->tw.xyz) - txn->tw.xyz = *xyz; + if (troika != &txn->tw.troika) + txn->tw.troika = *troika; /* LY: shrink datafile if needed */ if (unlikely(shrink)) { @@ -11587,10 +11595,10 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (unlikely(err != MDBX_SUCCESS)) return err; need_unlock = true; - env->me_txn0->tw.xyz = meta_tap(env); + env->me_txn0->tw.troika = meta_tap(env); eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); env->me_txn0->mt_txnid = - env->me_txn0->tw.xyz.txnid[env->me_txn0->tw.xyz.recent]; + env->me_txn0->tw.troika.txnid[env->me_txn0->tw.troika.recent]; txn_oldest_reader(env->me_txn0); } @@ -11599,7 +11607,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pagesize = env->me_psize; const MDBX_geo *const geo = inside_txn ? 
&env->me_txn->mt_geo - : &meta_recent(env, &env->me_txn0->tw.xyz).ptr_c->mm_geo; + : &meta_recent(env, &env->me_txn0->tw.troika).ptr_c->mm_geo; if (size_lower < 0) size_lower = pgno2bytes(env, geo->lower); if (size_now < 0) @@ -11813,7 +11821,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, const MDBX_geo *current_geo; if (!inside_txn) { eASSERT(env, need_unlock); - const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.xyz); + const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); uint64_t timestamp = 0; while ("workaround for " @@ -11909,7 +11917,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_txn->mt_flags |= MDBX_TXN_DIRTY; } else { meta.mm_geo = new_geo; - rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.xyz); + rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika); } if (likely(rc == MDBX_SUCCESS)) { @@ -12189,7 +12197,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, : env->me_dxb_mmap.limit); #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ - meta_xyz_t xyz = meta_tap(env); + meta_troika_t troika = meta_tap(env); eASSERT(env, !env->me_txn && !env->me_txn0); //-------------------------------- validate/rollback head & steady meta-pages if (unlikely(env->me_stuck_meta >= 0)) { @@ -12204,7 +12212,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, } } else /* not recovery mode */ while (1) { - const unsigned meta_clash_mask = meta_eq_mask(&xyz); + const unsigned meta_clash_mask = meta_eq_mask(&troika); if (unlikely(meta_clash_mask)) { ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); return MDBX_CORRUPTED; @@ -12213,7 +12221,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { /* non-exclusive mode, * meta-pages should be validated by a first process opened the DB */ - if (xyz.recent == xyz.prefer_steady) + if (troika.recent == 
troika.prefer_steady) break; if (!env->me_lck_mmap.lck) { @@ -12231,8 +12239,8 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, eASSERT(env, lck_rc == MDBX_RESULT_TRUE); /* exclusive mode */ - const meta_ptr_t recent = meta_recent(env, &xyz); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &xyz); + const meta_ptr_t recent = meta_recent(env, &troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &troika); MDBX_meta clone; if (prefer_steady.is_steady) { err = validate_meta_copy(env, prefer_steady.ptr_c, &clone); @@ -12309,9 +12317,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, pgno, recent.txnid, err); return err; } - xyz = meta_tap(env); + troika = meta_tap(env); ENSURE(env, 0 == meta_txnid(recent.ptr_v)); - ENSURE(env, 0 == meta_eq_mask(&xyz)); + ENSURE(env, 0 == meta_eq_mask(&troika)); } if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { @@ -12330,7 +12338,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, env->me_dxb_mmap.current, header.mm_geo.now); } - const meta_ptr_t recent = meta_recent(env, &xyz); + const meta_ptr_t recent = meta_recent(env, &troika); if (memcmp(&header.mm_geo, &recent.ptr_c->mm_geo, sizeof(header.mm_geo))) { if ((env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) { @@ -12367,7 +12375,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, ENSURE(env, header.unsafe_txnid == recent.txnid); meta_set_txnid(env, &header, next_txnid); err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &header, - &xyz); + &troika); if (err) { ERROR("error %d, while updating meta.geo: " "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO @@ -12411,7 +12419,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "updating db-format signature for", n, txnid, err); return err; } - xyz = meta_tap(env); + troika = meta_tap(env); } } } @@ -13341,8 +13349,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, #if MDBX_DEBUG 
if (rc == MDBX_SUCCESS) { - const meta_xyz_t xyz = meta_tap(env); - const meta_ptr_t head = meta_recent(env, &xyz); + const meta_troika_t troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &troika); const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; DEBUG("opened database version %u, pagesize %u", @@ -19784,13 +19792,13 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, jitter4testing(false); const size_t meta_bytes = pgno2bytes(env, NUM_METAS); - const meta_xyz_t xyz = meta_tap(env); + const meta_troika_t troika = meta_tap(env); /* Make a snapshot of meta-pages, * but writing ones after the data was flushed */ memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ (MDBX_meta *)(buffer + - ((uint8_t *)meta_recent(env, &xyz).ptr_c - env->me_map)); + ((uint8_t *)meta_recent(env, &troika).ptr_c - env->me_map)); mdbx_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) @@ -20361,22 +20369,22 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; - meta_xyz_t holder; - meta_xyz_t const *xyz; + meta_troika_t holder; + meta_troika_t const *troika; if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) - xyz = &txn->tw.xyz; + troika = &txn->tw.troika; else { holder = meta_tap(env); - xyz = &holder; + troika = &holder; } - const meta_ptr_t head = meta_recent(env, xyz); + const meta_ptr_t head = meta_recent(env, troika); arg->mi_recent_txnid = head.txnid; - arg->mi_meta0_txnid = xyz->txnid[0]; + arg->mi_meta0_txnid = troika->txnid[0]; arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign); - arg->mi_meta1_txnid = xyz->txnid[1]; + arg->mi_meta1_txnid = troika->txnid[1]; arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign); - arg->mi_meta2_txnid = xyz->txnid[2]; + arg->mi_meta2_txnid = troika->txnid[2]; arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign); if (likely(bytes > 
size_before_bootid)) { memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); @@ -21131,12 +21139,12 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, size_t bytes_retained = 0; uint64_t lag = 0; if (txnid) { - meta_xyz_t xyz = meta_tap(env); + meta_troika_t troika = meta_tap(env); retry_header:; - const meta_ptr_t head = meta_recent(env, &xyz); + const meta_ptr_t head = meta_recent(env, &troika); const uint64_t head_pages_retired = unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); - if (unlikely(meta_should_retry(env, &xyz) || + if (unlikely(meta_should_retry(env, &troika) || head_pages_retired != unaligned_peek_u64_volatile( 4, head.ptr_v->mm_pages_retired))) @@ -21337,7 +21345,7 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, int retry = 0; do { const txnid_t steady = - env->me_txn->tw.xyz.txnid[env->me_txn->tw.xyz.prefer_steady]; + env->me_txn->tw.troika.txnid[env->me_txn->tw.troika.prefer_steady]; env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; oldest = find_oldest_reader(env, steady); eASSERT(env, oldest < env->me_txn0->mt_txnid); @@ -21376,7 +21384,7 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, stucked->mr_snapshot_pages_retired.weak != hold_retired) continue; - const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.xyz); + const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.troika); const txnid_t gap = (head.txnid - laggard) / xMDBX_TXNID_STEP; const uint64_t head_retired = unaligned_peek_u64(4, head.ptr_c->mm_pages_retired); @@ -21463,16 +21471,16 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) } txnid_t lag; - meta_xyz_t xyz = meta_tap(env); + meta_troika_t troika = meta_tap(env); do { - const meta_ptr_t head = meta_recent(env, &xyz); + const meta_ptr_t head = meta_recent(env, &troika); if (percent) { const pgno_t maxpg = head.ptr_v->mm_geo.now; *percent = (int)((head.ptr_v->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } lag = 
(head.txnid - txn->mt_txnid) / xMDBX_TXNID_STEP; - } while (unlikely(meta_should_retry(env, &xyz))); + } while (unlikely(meta_should_retry(env, &troika))); return (lag > INT_MAX) ? INT_MAX : (int)lag; } @@ -23091,13 +23099,15 @@ __cold void global_ctor(void) { const uint8_t c01 = (i / (8 * 1)) % 3; const uint8_t c02 = (i / (8 * 3)) % 3; const uint8_t c12 = (i / (8 * 9)) % 3; - const uint8_t xyz = meta_cmp2pack(c01, c02, c12, s0, s1, s2); - const uint8_t recent = (xyz >> 2) & 3; - const uint8_t prefer_steady = (xyz >> 4) & 3; - const uint8_t tail = xyz & 3; - const bool strict = (xyz & 64) != 0; - const bool valid = (xyz & 128) != 0; + const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2); + meta_troika_t troika; + troika.fsm = (uint8_t)i; + meta_troika_unpack(&troika, packed); + + const uint8_t tail = TROIKA_TAIL(&troika); + const bool strict = TROIKA_STRICT_VALID(&troika); + const bool valid = TROIKA_VALID(&troika); const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1) ? (meta_cmp2recent(c02, s0, s2) ? 
0 : 2) @@ -23118,13 +23128,13 @@ __cold void global_ctor(void) { c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && (c12 != 1 || s1 != s2); - assert(recent == recent_chk); - assert(prefer_steady == prefer_steady_chk); + assert(troika.recent == recent_chk); + assert(troika.prefer_steady == prefer_steady_chk); assert(tail == tail_chk); assert(valid == valid_chk); assert(strict == strict_chk); - // printf(" %d, ", xyz); - assert(xyz_fsm_map[i] == xyz); + // printf(" %d, ", packed); + assert(troika_fsm_map[troika.fsm] == packed); } #endif /* MDBX_DEBUG*/ diff --git a/src/internals.h b/src/internals.h index c58345c0..74137815 100644 --- a/src/internals.h +++ b/src/internals.h @@ -920,12 +920,14 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; -typedef struct xyz { +typedef struct troika { uint8_t fsm, recent, prefer_steady, tail_and_flags; -#define XYZ_HAVE_STEADY(xyz) ((xyz)->fsm & 7) -#define XYZ_VALID(xyz) ((xyz)->tail_pgno_and_flags & 128) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) txnid_t txnid[NUM_METAS]; -} meta_xyz_t; +} meta_troika_t; /* A database transaction. * Every operation requires a transaction handle. 
*/ @@ -1000,7 +1002,7 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { - meta_xyz_t xyz; + meta_troika_t troika; /* In write txns, array of cursors for each DB */ pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ From ceba040e32388834687581002f195788f7090a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 20 Aug 2022 00:42:16 +0300 Subject: [PATCH 099/364] mdbx: add `meta_troika_dump()`. --- src/core.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/core.c b/src/core.c index 1da32061..f18a8c36 100644 --- a/src/core.c +++ b/src/core.c @@ -5192,6 +5192,26 @@ static const char *durable_caption(const volatile MDBX_meta *const meta) { return "Weak"; } +__cold static void meta_troika_dump(const MDBX_env *env, + const meta_troika_t *troika) { + const meta_ptr_t recent = meta_recent(env, troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); + const meta_ptr_t tail = meta_tail(env, troika); + NOTICE("%" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " + "head=%d-%" PRIaTXN ".%c, " + "base=%d-%" PRIaTXN ".%c, " + "tail=%d-%" PRIaTXN ".%c, " + "valid %c, strict %c", + troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1], + (troika->fsm & 2) ? 's' : 'w', troika->txnid[2], + (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent, + recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady, + prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w', + troika->tail_and_flags % NUM_METAS, tail.txnid, + tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N', + TROIKA_STRICT_VALID(troika) ? 'Y' : 'N'); +} + /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. 
*/ @@ -12198,6 +12218,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ meta_troika_t troika = meta_tap(env); +#if MDBX_DEBUG + meta_troika_dump(env, &troika); +#endif eASSERT(env, !env->me_txn && !env->me_txn0); //-------------------------------- validate/rollback head & steady meta-pages if (unlikely(env->me_stuck_meta >= 0)) { @@ -12208,6 +12231,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, if (unlikely(err != MDBX_SUCCESS)) { ERROR("target meta[%u] is corrupted", bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } } else /* not recovery mode */ @@ -12215,6 +12239,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, const unsigned meta_clash_mask = meta_eq_mask(&troika); if (unlikely(meta_clash_mask)) { ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } @@ -12228,6 +12253,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, /* LY: without-lck (read-only) mode, so it is impossible that other * process made weak checkpoint. 
*/ ERROR("%s", "without-lck, unable recovery/rollback"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } @@ -12248,6 +12274,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", bytes2pgno(env, (uint8_t *)prefer_steady.ptr_c - env->me_map), "steady", prefer_steady.txnid, "manual recovery"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } if (prefer_steady.ptr_c == recent.ptr_c) @@ -12265,11 +12292,13 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, ERROR("%s for open or automatic rollback, %s", "there are no suitable meta-pages", "manual recovery is required"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } WARNING("meta[%u] with last txnid %" PRIaTXN " is corrupted, rollback needed", pgno, recent.txnid); + meta_troika_dump(env, &troika); goto purge_meta_head; } @@ -12279,6 +12308,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "rollback NOT needed, steady-sync NEEDED%s", "opening after an unclean shutdown", bootid.x, bootid.y, ", but unable in read-only mode"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " @@ -12294,6 +12324,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "opening after an unclean shutdown", "there are no suitable meta-pages", "manual recovery is required"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { @@ -12301,6 +12332,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, " to steady %" PRIaTXN ")%s", "opening after an unclean shutdown", recent.txnid, prefer_steady.txnid, ", but unable in read-only mode"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } @@ -12309,6 +12341,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "purge%s meta[%u] with%s txnid %" PRIaTXN, "opening after an unclean shutdown", 
last_valid ? "" : " invalid", pgno, last_valid ? " weak" : "", recent.txnid); + meta_troika_dump(env, &troika); ENSURE(env, prefer_steady.is_steady); err = override_meta(env, pgno, 0, last_valid ? recent.ptr_c : prefer_steady.ptr_c); From 0ccec204096864f9694e117f6056d26b73a68bcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 20 Aug 2022 01:32:38 +0300 Subject: [PATCH 100/364] mdbx: don't deem meta pages with zero txnid equal. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Устранение крайне маловероятного регресса после перехода на мета-тройку: - процесс А открывает БД и читает мета-страницы для формирования тройки; - процесс Б постоянно коммитит новые транзакции; - есть шанс, что процесс А при чтении разных мета-страниц попадет на момент их обновления более одного раза, это может привести к ложной ошибке коллизии мета-страниц, так как для обновляемых мета-страниц будет виден нулевой номер транзакции. 
--- src/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index f18a8c36..b69d6856 100644 --- a/src/core.c +++ b/src/core.c @@ -5139,7 +5139,8 @@ static __inline bool meta_eq(const meta_troika_t *troika, unsigned a, unsigned b) { assert(a < NUM_METAS && b < NUM_METAS); return troika->txnid[a] == troika->txnid[b] && - (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0; + (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 && + troika->txnid[a]; } static unsigned meta_eq_mask(const meta_troika_t *troika) { From b36a07a512c1412d5753219aa8fc66cab75a012a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 24 Aug 2022 16:24:22 +0300 Subject: [PATCH 101/364] mdbx: release v0.12.1 (Positive Proxima) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The planned frontward release with new superior features on the day of 20th anniversary of [Positive Technologies](https://ptsecurity.com). New: ---- - The `Big Foot` feature which significantly reduces GC overhead for processing large lists of retired pages from huge transactions. Now _libmdbx_ avoids creating large chunks of PNLs (page number lists) which required long sequences of free pages, aka large/overflow pages. Thus avoiding searching, allocating and storing such sequences inside GC. - Improved hot/online validation and checking of database pages both for more robustness and performance. - New solid and fast method to latch meta-pages called `Troika`. The minimum of memory barriers, reads, comparisons and conditional transitions are used. - New `MDBX_VALIDATION` environment options to extra validation of DB structure and pages content for carefully/safe handling damaged or untrusted DB. - Accelerated ×16/×8/×4 by AVX512/AVX2/SSE2/Neon implementations of search page sequences. 
- Added the `gcrtime_seconds16dot16` counter to the "Page Operation Statistics" that accumulates time spent for GC searching and reclaiming. - Copy-with-compactification now clears/zeroes unused gaps inside database pages. - The `C` and `C++` APIs has been extended and/or refined to simplify using `wchar_t` pathnames. On Windows the `mdbx_env_openW()`, `mdbx_env_get_pathW()`, `mdbx_env_copyW()`, `mdbx_env_open_for_recoveryW()` are available for now, but the `mdbx_env_get_path()` has been replaced in favor of `mdbx_env_get_pathW()`. - Added explicit error message for Buildroot's Microblaze toolchain maintainers. - Added `MDBX_MANAGE_BUILD_FLAGS` build options for CMake. - Speed-up internal `bsearch`/`lower_bound` implementation using branchless tactic, including workaround for CLANG x86 optimiser bug. - A lot internal refinement and micro-optimisations. - Internally counted volume of dirty pages (unused for now but for coming features). Fixes: ------ - Never use modern `__cxa_thread_atexit()` on Apple's OSes. - Don't check owner for finished transactions. - Fixed typo in `MDBX_EINVAL` which breaks MingGW builds with CLANG. 37 files changed, 7604 insertions(+), 7417 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) --- ChangeLog.md | 27 ++++++++++++++++----------- src/man1/mdbx_chk.1 | 2 +- src/man1/mdbx_copy.1 | 2 +- src/man1/mdbx_drop.1 | 2 +- src/man1/mdbx_dump.1 | 2 +- src/man1/mdbx_load.1 | 2 +- src/man1/mdbx_stat.1 | 2 +- 7 files changed, 22 insertions(+), 17 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index b629fd74..f69516cc 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,37 +1,42 @@ ChangeLog --------- -## v0.12.1 (Positive Proxima) scheduled to 2022-08-24 +## v0.12.1 (Positive Proxima) at 2022-08-24 The planned frontward release with new superior features on the day of 20 anniversary of [Positive Technologies](https://ptsecurty.com). 
+``` +37 files changed, 7604 insertions(+), 7417 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + New: - The `Big Foot` feature which significantly reduces GC overhead for processing large lists of retired pages from huge transactions. Now _libmdbx_ avoid creating large chunks of PNLs (page number lists) which required a long sequences of free pages, aka large/overflow pages. Thus avoiding searching, allocating and storing such sequences inside GC. - Improved hot/online validation and checking of database pages both for more robustness and performance. + - New solid and fast method to latch meta-pages called `Troika`. + The minimum of memory barriers, reads, comparisons and conditional transitions are used. - New `MDBX_VALIDATION` environment options to extra validation of DB structure and pages content for carefully/safe handling damaged or untrusted DB. - - Optionally cache for pointers to last/steady meta-pages (currently is off by default). + - Accelerated ×16/×8/×4 by AVX512/AVX2/SSE2/Neon implementations of search page sequences. - Added the `gcrtime_seconds16dot16` counter to the "Page Operation Statistics" that accumulates time spent for GC searching and reclaiming. - Copy-with-compactification now clears/zeroes unused gaps inside database pages. - - The C++ API has been refined to simplify support for `wchar_t` in path names. + - The `C` and `C++` APIs has been extended and/or refined to simplify using `wchar_t` pathnames. + On Windows the `mdbx_env_openW()`, ``mdbx_env_get_pathW()`()`, `mdbx_env_copyW()`, `mdbx_env_open_for_recoveryW()` are available for now, + but the `mdbx_env_get_path()` has been replaced in favor of `mdbx_env_get_pathW()`. - Added explicit error message for Buildroot's Microblaze toolchain maintainers. + - Added `MDBX_MANAGE_BUILD_FLAGS` build options for CMake. + - Speed-up internal `bsearch`/`lower_bound` implementation using branchless tactic, including workaround for CLANG x86 optimiser bug. 
+ - A lot internal refinement and micro-optimisations. + - Internally counted volume of dirty pages (unused for now but for coming features). Fixes: - Never use modern `__cxa_thread_atexit()` on Apple's OSes. - - Use `MultiByteToWideChar(CP_THREAD_ACP)` instead of `mbstowcs()`. - Don't check owner for finished transactions. - Fixed typo in `MDBX_EINVAL` which breaks MingGW builds with CLANG. -Minors: - - - Fixed variable name typo. - - Using `ldd` to check used dso. - - Added `MDBX_WEAK_IMPORT_ATTRIBUTE` macro. - - Use current transaction geometry for untouched parameters when `env_set_geometry()` called within a write transaction. - - Minor clarified `iov_page()` failure case. ## v0.12.0 at 2022-06-19 diff --git a/src/man1/mdbx_chk.1 b/src/man1/mdbx_chk.1 index 343b80cb..da2e78fb 100644 --- a/src/man1/mdbx_chk.1 +++ b/src/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ .\" Copyright 2015-2022 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_CHK 1 "2022-06-19" "MDBX 0.12.0" +.TH MDBX_CHK 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/src/man1/mdbx_copy.1 b/src/man1/mdbx_copy.1 index 4a861172..3cb97a34 100644 --- a/src/man1/mdbx_copy.1 +++ b/src/man1/mdbx_copy.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2022-06-19" "MDBX 0.12.0" +.TH MDBX_COPY 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/src/man1/mdbx_drop.1 b/src/man1/mdbx_drop.1 index 15945800..099c485b 100644 --- a/src/man1/mdbx_drop.1 +++ b/src/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ .\" Copyright 2021-2022 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_DROP 1 "2022-06-19" "MDBX 0.12.0" +.TH MDBX_DROP 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/src/man1/mdbx_dump.1 b/src/man1/mdbx_dump.1 index 4e360edf..417488e7 100644 --- a/src/man1/mdbx_dump.1 +++ b/src/man1/mdbx_dump.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2022-06-19" "MDBX 0.12.0" +.TH MDBX_DUMP 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/src/man1/mdbx_load.1 b/src/man1/mdbx_load.1 index 1363d56b..4ab41fbf 100644 --- a/src/man1/mdbx_load.1 +++ b/src/man1/mdbx_load.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_LOAD 1 "2022-06-19" "MDBX 0.12.0" +.TH MDBX_LOAD 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/src/man1/mdbx_stat.1 b/src/man1/mdbx_stat.1 index 1580ed44..a47d52f0 100644 --- a/src/man1/mdbx_stat.1 +++ b/src/man1/mdbx_stat.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2022-06-19" "MDBX 0.12.0" +.TH MDBX_STAT 1 "2022-08-24" "MDBX 0.12.1" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS From b73727d73eef4cc6b4240f8e366b3e8278f51761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 26 Aug 2022 18:52:35 +0300 Subject: [PATCH 102/364] mdbx: add `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` build option. 
--- src/core.c | 8 ++------ src/options.h | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index 547ef3be..dc256137 100644 --- a/src/core.c +++ b/src/core.c @@ -1174,7 +1174,7 @@ static __inline int rthc_atexit(void (*dtor)(void *), void *obj, #ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL #if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) || \ defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) || \ - defined(ANDROID) + defined(BIONIC) #define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1 #else #define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0 @@ -6232,10 +6232,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #ifdef scan4seq /* The scan4seq() is the best or no alternatives */ -#else -#if !(__has_builtin(__builtin_cpu_supports) || \ - defined(__BUILTIN_CPU_SUPPORTS__) || \ - (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23))) +#elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS /* The scan4seq_default() will be used since no cpu-features detection support * from compiler. Please don't ask to implement cpuid-based detection and don't * make such PRs. */ @@ -6272,7 +6269,6 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, scan4seq = choice ? choice : scan4seq_default; return scan4seq(range, len, seq); } -#endif /* __has_builtin(__builtin_cpu_supports */ #endif /* scan4seq */ //------------------------------------------------------------------------------ diff --git a/src/options.h b/src/options.h index b5d760b4..08018630 100644 --- a/src/options.h +++ b/src/options.h @@ -194,6 +194,25 @@ #ifndef MDBX_HAVE_C11ATOMICS #endif /* MDBX_HAVE_C11ATOMICS */ +/** If defined then enables use the GCC's `__builtin_cpu_supports()` + * for runtime dispatching depending on the CPU's capabilities. 
*/ +#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS +#if defined(__APPLE__) || defined(BIONIC) +/* Never use any modern features on Apple's or Google's OSes + * since a lot of troubles with compatibility and/or performance */ +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0 +#elif __has_builtin(__builtin_cpu_supports) || \ + defined(__BUILTIN_CPU_SUPPORTS__) || \ + (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23)) +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1 +#else +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0 +#endif +#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \ + MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1) +#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1 +#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ From 3230fb57887b3ede297a99fd6245906a9d6736e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 26 Aug 2022 19:27:17 +0300 Subject: [PATCH 103/364] mdbx: update ChangeLog. --- ChangeLog.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index f69516cc..7311db18 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,17 @@ ChangeLog --------- +## Underway v0.12.2 + +New: + + - Added `MDBX_HAVE_BUILT IN_CPU_SUPPORTS` build option to control use GCC's `__builtin_cpu_supports()` function, + which could be unavailable on a fake OSes (macos, ios, android, etc). + + +------------------------------------------------------------------------------- + + ## v0.12.1 (Positive Proxima) at 2022-08-24 The planned frontward release with new superior features on the day of 20 anniversary of [Positive Technologies](https://ptsecurty.com). @@ -76,6 +87,7 @@ Minors: - Use current transaction geometry for untouched parameters when `env_set_geometry()` called within a write transaction. 
- Minor clarified `iov_page()` failure case. + ------------------------------------------------------------------------------- From 2d7c25b263814f1caee8122dfe527e5664ff61e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 2 Sep 2022 01:46:11 +0300 Subject: [PATCH 104/364] mdbx: minor fix extra ensure/assertion check of `oldest_reader` inside `txn_end()`. --- src/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/core.c b/src/core.c index dc256137..05adabf4 100644 --- a/src/core.c +++ b/src/core.c @@ -8630,10 +8630,6 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - ENSURE(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); - if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ cursors_eot(txn, false); @@ -8643,6 +8639,9 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { MDBX_reader *slot = txn->to.reader; eASSERT(env, slot->mr_pid.weak == env->me_pid); if (likely(!(txn->mt_flags & MDBX_TXN_FINISHED))) { + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); eASSERT(env, txn->mt_txnid == slot->mr_txnid.weak && slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); @@ -8671,6 +8670,9 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; txn->mt_owner = 0; } else if (!(txn->mt_flags & MDBX_TXN_FINISHED)) { + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (txn == env->me_txn0) txn_valgrind(env, nullptr); From 52cb6b90a783cce5f4c973979d09d955c09e821c Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 2 Sep 2022 02:02:33 +0300 Subject: [PATCH 105/364] mdbx: fix extra check for `MDBX_APPENDDUP`. --- src/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 05adabf4..4047e4b8 100644 --- a/src/core.c +++ b/src/core.c @@ -15571,7 +15571,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; } else { - if (unlikely(rc != MDBX_SUCCESS || !(flags & MDBX_APPENDDUP))) + if (unlikely(rc != MDBX_SUCCESS)) /* new-key < last-key * or new-key == last-key without MDBX_APPENDDUP */ return MDBX_EKEYMISMATCH; From 29da09328ea325a23dfcf17acdc038ca8dc4ab22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 5 Sep 2022 15:53:35 +0300 Subject: [PATCH 106/364] mdbx: removed description of deprecated usage of `MDBX_NODUPDATA`. --- mdbx.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mdbx.h b/mdbx.h index f536d41d..88550dd8 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1593,8 +1593,7 @@ enum MDBX_put_flags_t { MDBX_NOOVERWRITE = UINT32_C(0x10), /** Has effect only for \ref MDBX_DUPSORT databases. - * For upsertion: don't write if the key-value pair already exist. - * For deletion: remove all values for key. */ + * For upsertion: don't write if the key-value pair already exist. */ MDBX_NODUPDATA = UINT32_C(0x20), /** For upsertion: overwrite the current key/data pair. From 2d5438d2c2f87656fd0a7139c02ceeb32c0cb38a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 6 Sep 2022 13:01:17 +0300 Subject: [PATCH 107/364] mdbx: fix regression ASAN/Valgring-enabled builds. 
--- src/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 4047e4b8..8917040e 100644 --- a/src/core.c +++ b/src/core.c @@ -7371,7 +7371,7 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { pgno_t last = MAX_PAGENO + 1; if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { /* inside write-txn */ - last = meta_recent(env, &env->me_txn0->troika).ptr_v->mm_geo.next; + last = meta_recent(env, &env->me_txn0->tw.troika).ptr_v->mm_geo.next; } else if (env->me_flags & MDBX_RDONLY) { /* read-only mode, no write-txn, no wlock mutex */ last = NUM_METAS; From c4efa8dce820393fe071851fc43005bdb51907eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 9 Sep 2022 19:01:59 +0300 Subject: [PATCH 108/364] mdbx: update ChangeLog. --- ChangeLog.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index 7311db18..d3e2208c 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -8,6 +8,16 @@ New: - Added `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` build option to control use GCC's `__builtin_cpu_supports()` function, which could be unavailable on a fake OSes (macos, ios, android, etc). +Fixes: + + - Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` which could result in returning `MDBX_EKEYMISMATCH` for valid cases. + +Minors: + + - Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`. + - Removed description of deprecated usage of `MDBX_NODUPDATA`. + - Fixed regression ASAN/Valgring-enabled builds.
+ ------------------------------------------------------------------------------- From a089f730029339a2d82ddb298c06099178b24136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 13 Sep 2022 11:39:55 +0300 Subject: [PATCH 109/364] mdbx: fix minor MinGW warning. --- ChangeLog.md | 1 + src/core.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index d3e2208c..dce5c865 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -17,6 +17,7 @@ Minors: - Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`. - Removed description of deprecated usage of `MDBX_NODUPDATA`. - Fixed regression ASAN/Valgring-enabled builds. + - Fixed minor MingGW warning. ------------------------------------------------------------------------------- diff --git a/src/core.c b/src/core.c index 8917040e..a37eacac 100644 --- a/src/core.c +++ b/src/core.c @@ -5244,7 +5244,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { new_oldest = steady; for (unsigned i = 0; i < snap_nreaders; ++i) { - const mdbx_pid_t pid = + const uint32_t pid = atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (!pid) continue; From bec9312df506eda0591e5feccb3ee70b41e1145c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 16 Sep 2022 17:32:41 +0300 Subject: [PATCH 110/364] mdbx: more off/on for clang-format. 
--- src/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/core.c b/src/core.c index a37eacac..c8fd9e03 100644 --- a/src/core.c +++ b/src/core.c @@ -2131,6 +2131,8 @@ static int lcklist_detach_locked(MDBX_env *env) { do { \ } while (0) +/* *INDENT-OFF* */ +/* clang-format off */ #define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ static __always_inline const TYPE_LIST *NAME( \ const TYPE_LIST *it, unsigned length, const TYPE_ARG item) { \ @@ -2178,6 +2180,8 @@ static int lcklist_detach_locked(MDBX_env *env) { \ return it; \ } +/* *INDENT-ON* */ +/* clang-format on */ /*----------------------------------------------------------------------------*/ From cf8540d84e1f02baf504bb3f451c60acd4e1439f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 17 Sep 2022 10:02:42 +0300 Subject: [PATCH 111/364] mdbx: minor refine `mdbx_env_create()`. --- src/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/core.c b/src/core.c index c8fd9e03..4352545e 100644 --- a/src/core.c +++ b/src/core.c @@ -11489,6 +11489,16 @@ lckless_stub(const MDBX_env *env) { } __cold int mdbx_env_create(MDBX_env **penv) { + if (unlikely(!penv)) + return MDBX_EINVAL; + *penv = nullptr; + + const size_t os_psize = osal_syspagesize(); + if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { + ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); + return MDBX_INCOMPATIBLE; + } + MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); if (unlikely(!env)) return MDBX_ENOMEM; @@ -11516,12 +11526,6 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; int rc; - const size_t os_psize = osal_syspagesize(); - if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { - ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); - rc = MDBX_INCOMPATIBLE; - goto bailout; 
- } env->me_os_psize = (unsigned)os_psize; setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize : MAX_PAGESIZE); @@ -11558,7 +11562,6 @@ __cold int mdbx_env_create(MDBX_env **penv) { bailout: osal_free(env); - *penv = nullptr; return rc; } From fe20de136c22ed3bc4c6d3f673e79c106e824f60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 18 Sep 2022 13:21:38 +0300 Subject: [PATCH 112/364] mdbx: require linux >= 4.0 --- src/core.c | 11 +++++++++++ src/osal.c | 13 ++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/core.c b/src/core.c index 4352545e..a1f635d0 100644 --- a/src/core.c +++ b/src/core.c @@ -11499,6 +11499,17 @@ __cold int mdbx_env_create(MDBX_env **penv) { return MDBX_INCOMPATIBLE; } +#if defined(__linux__) || defined(__gnu_linux__) + if (unlikely(linux_kernel_version < 0x04000000)) { + /* 2022-09-01: Прошло уже больше двух после окончания какой-либо поддержки + * самого "долгоиграющего" ядра 3.16.85 ветки 3.x */ + ERROR("too old linux kernel %u.%u.%u.%u, the >= 4.0.0 is required", + linux_kernel_version >> 24, (linux_kernel_version >> 16) & 255, + (linux_kernel_version >> 8) & 255, linux_kernel_version & 255); + return MDBX_INCOMPATIBLE; + } +#endif /* Linux */ + MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); if (unlikely(!env)) return MDBX_ENOMEM; diff --git a/src/osal.c b/src/osal.c index a3e23eee..ec26357c 100644 --- a/src/osal.c +++ b/src/osal.c @@ -937,9 +937,8 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, break /* error */; #if defined(__linux__) || defined(__gnu_linux__) case MDBX_SYNC_SIZE: - if (linux_kernel_version >= 0x03060000) - return MDBX_SUCCESS; - __fallthrough /* fall through */; + assert(linux_kernel_version >= 0x03060000); + return MDBX_SUCCESS; #endif /* Linux */ #endif /* _POSIX_SYNCHRONIZED_IO > 0 */ default: @@ -1076,10 +1075,10 @@ MDBX_INTERNAL_FUNC int 
osal_msync(osal_mmap_t *map, size_t offset, return (int)GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) - if (mode_bits == MDBX_SYNC_NONE && linux_kernel_version > 0x02061300) - /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly - * tracks dirty pages and flushes them to storage as necessary. */ - return MDBX_SUCCESS; + assert(linux_kernel_version > 0x02061300); + /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly + * tracks dirty pages and flushes them to storage as necessary. */ + return MDBX_SUCCESS; #endif /* Linux */ if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) return errno; From beda291692230b812d33e90ab237b3cfaa2fbfcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 21 Sep 2022 14:32:18 +0300 Subject: [PATCH 113/364] mdbx-windows: fix nasty `clz()` (i.e. using `_BitScanReverse()` bug. --- ChangeLog.md | 1 + src/core.c | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index dce5c865..543b5d83 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -11,6 +11,7 @@ New: Fixes: - Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` which could result in returning `MDBX_EKEYMISMATCH` for valid cases. + - Fixed nasty `clz()` bug (by using `_BitScanReverse()`, only MSVC builds affected). 
Minors: diff --git a/src/core.c b/src/core.c index a1f635d0..ea3e7331 100644 --- a/src/core.c +++ b/src/core.c @@ -5905,16 +5905,27 @@ MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, #if defined(_MSC_VER) && !defined(__builtin_clz) && \ !__has_builtin(__builtin_clz) -MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(unsigned value) { +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(uint32_t value) { unsigned long index; _BitScanReverse(&index, value); - return index; + return 31 - index; } #endif /* _MSC_VER */ #if defined(_MSC_VER) && !defined(__builtin_clzl) && \ !__has_builtin(__builtin_clzl) -#define __builtin_clzl(value) __builtin_clz(value) +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) { + unsigned long index; +#ifdef _WIN64 + assert(sizeof(value) == 8); + _BitScanReverse64(&index, value); + return 63 - index; +#else + assert(sizeof(value) == 4); + _BitScanReverse(&index, value); + return 31 - index; +#endif +} #endif /* _MSC_VER */ #if !defined(MDBX_ATTRIBUTE_TARGET) && \ From f51ace3db85150b57305ab140b536192a103a7c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 22 Sep 2022 17:26:59 +0300 Subject: [PATCH 114/364] mdbx-windows: always call debugger if it present when assertion check failed. 
--- src/osal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osal.c b/src/osal.c index ec26357c..b8b8cf54 100644 --- a/src/osal.c +++ b/src/osal.c @@ -242,14 +242,14 @@ __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, if (num < 1 || !message) message = ""; OutputDebugStringA(message); - if (IsDebuggerPresent()) - DebugBreak(); #else __assert_fail(msg, "mdbx", line, func); #endif } #if defined(_WIN32) || defined(_WIN64) + if (IsDebuggerPresent()) + DebugBreak(); FatalExit(ERROR_UNHANDLED_ERROR); #else abort(); From 32a3674dc892942ded442c86106bf187b4846901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 22 Sep 2022 19:48:44 +0300 Subject: [PATCH 115/364] mdbx: return `MDBX_PROBLEM` insted of `MDBX_CORRUPTED` on coherence troubles. --- src/core.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/core.c b/src/core.c index ea3e7331..5f3fde43 100644 --- a/src/core.c +++ b/src/core.c @@ -3116,6 +3116,7 @@ static int __must_check_result page_split(MDBX_cursor *mc, MDBX_val *const newdata, pgno_t newpgno, const unsigned naf); +static int coherency_timeout(uint64_t *timestamp, pgno_t pgno); static bool coherency_check_meta(const MDBX_env *env, const volatile MDBX_meta *meta, bool report); static int __must_check_result validate_meta_copy(MDBX_env *env, @@ -4389,25 +4390,13 @@ static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) { while (likely(rc == MDBX_SUCCESS) && unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) { if (!timestamp) { - timestamp = osal_monotime(); iov_done(txn, ctx); WARNING( "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); - } else if (unlikely(osal_monotime() - timestamp > 65536 / 10)) { - ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", 
wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); - rc = MDBX_CORRUPTED; } -#if defined(_WIN32) || defined(_WIN64) - SwitchToThread(); -#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) - sched_yield(); -#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) - pthread_yield(); -#else - usleep(42); -#endif + if (coherency_timeout(&timestamp, wp->mp_pgno) != MDBX_RESULT_TRUE) + rc = MDBX_PROBLEM; } dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); } @@ -7629,13 +7618,17 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, return ok; } -__cold static int coherency_timeout(uint64_t *timestamp) { +__cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { if (likely(timestamp && *timestamp == 0)) *timestamp = osal_monotime(); else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) { - ERROR("bailout waiting for valid snapshot (%s)", - "workaround for incoherent flaw of unified page/buffer cache"); - return MDBX_CORRUPTED; + if (pgno) + ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); + else + ERROR("bailout waiting for valid snapshot (%s)", + "workaround for incoherent flaw of unified page/buffer cache"); + return MDBX_PROBLEM; } osal_memory_fence(mo_AcquireRelease, true); @@ -7660,7 +7653,7 @@ __hot static int coherency_check_readed(const MDBX_env *env, uint64_t *timestamp) { const bool report = !(timestamp && *timestamp); if (unlikely(!coherency_check(env, txnid, dbs, meta, report))) - return coherency_timeout(timestamp); + return coherency_timeout(timestamp, 0); return MDBX_SUCCESS; } @@ -7675,7 +7668,7 @@ static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, (head_txnid < MIN_TXNID) ?
"invalid" : "unexpected", head_txnid, bytes2pgno(env, (const uint8_t *)meta - env->me_dxb_mmap.dxb), "(workaround for incoherent flaw of unified page/buffer cache)"); - return coherency_timeout(timestamp); + return coherency_timeout(timestamp, 0); } return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp); } From 00515d50a9656188bb42f53edd3e58d3132efe29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 23 Sep 2022 10:52:46 +0300 Subject: [PATCH 116/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=80=D0=BE=D0=B2?= =?UTF-8?q?=D0=B5=D1=80=D0=BE=D1=87=D0=BD=D0=BE=D0=B3=D0=BE=20=D1=83=D1=82?= =?UTF-8?q?=D0=B2=D0=B5=D1=80=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=B2?= =?UTF-8?q?=20`page=5Fretire=5Fex()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index 5f3fde43..ae3d8498 100644 --- a/src/core.c +++ b/src/core.c @@ -4049,8 +4049,8 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, check = page_get_any(mc, pgno, txn->mt_front); if (unlikely(check.err != MDBX_SUCCESS)) return check.err; - tASSERT(txn, - (check.page->mp_flags & ~P_LEAF2) == (pageflags & ~P_FROZEN)); + tASSERT(txn, (check.page->mp_flags & ~(P_LEAF2 | P_SPILLED)) == + (pageflags & ~P_FROZEN)); tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } if (pageflags & P_FROZEN) { From 41b918f1fcc39d9f0a669f0e0e02527b2e2bcb00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 23 Sep 2022 21:47:32 +0300 Subject: [PATCH 117/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= 
=?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=80=D0=BE=D0=B2?= =?UTF-8?q?=D0=B5=D1=80=D0=BE=D1=87=D0=BD=D0=BE=D0=B3=D0=BE=20=D1=83=D1=82?= =?UTF-8?q?=D0=B2=D0=B5=D1=80=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=B2?= =?UTF-8?q?=D0=BD=D1=83=D1=82=D1=80=D0=B8=20`mdbx=5Ftxn=5Fabort()`=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BE=D1=87=D0=BD?= =?UTF-8?q?=D1=8B=D1=85=20=D1=82=D1=80=D0=B0=D0=BD=D0=B7=D0=B0=D0=BA=D1=86?= =?UTF-8?q?=D0=B8=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index ae3d8498..3439a3dd 100644 --- a/src/core.c +++ b/src/core.c @@ -8822,7 +8822,7 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (txn->mt_child) mdbx_txn_abort(txn->mt_child); - tASSERT(txn, dirtylist_check(txn)); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_ERROR) || dirtylist_check(txn)); return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); } @@ -10622,6 +10622,7 @@ provide_latency: return rc; fail: + txn->mt_flags |= MDBX_TXN_ERROR; mdbx_txn_abort(txn); goto provide_latency; } From 9f64e2a10c44379659f9a210ff77785aa9f68ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 24 Sep 2022 01:29:47 +0300 Subject: [PATCH 118/364] =?UTF-8?q?mdbx:=20=D0=BF=D1=80=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B0=20=D1=81=D0=BF=D0=B8=D0=BB=D0=B8=D0=BD=D0=B3=D0=B0=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D1=83=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D1=8F=20=D1=81=D1=80=D0=B0=D0=B1=D0=B0=D1=82=D1=8B?= =?UTF-8?q?=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20=D0=BF=D1=80=D0=BE=D0=B2=D0=B5?= =?UTF-8?q?=D1=80=D0=BE=D1=87=D0=BD=D1=8B=D1=85=20=D1=83=D1=82=D0=B2=D0=B5?= =?UTF-8?q?=D1=80=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=B2=20=D0=BE?= 
=?UTF-8?q?=D1=82=D0=BB=D0=B0=D0=B4=D0=BE=D1=87=D0=BD=D1=8B=D1=85=20=D1=81?= =?UTF-8?q?=D0=B1=D0=BE=D1=80=D0=BA=D0=B0=D1=85.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/core.c b/src/core.c index 3439a3dd..65e7ae40 100644 --- a/src/core.c +++ b/src/core.c @@ -4718,7 +4718,7 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); /* half of 8-bit radix-sort */ - unsigned radix_counters[256], spillable = 0, spilled = 0; + pgno_t radix_counters[256], spillable = 0; memset(&radix_counters, 0, sizeof(radix_counters)); const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); for (unsigned i = 1; i <= dl->length; ++i) { @@ -4744,14 +4744,15 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, break; } - VERBOSE("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, " - "wanna_spill %u", - prio2spill, prio2adjacent, amount, spillable, wanna_spill); + VERBOSE("prio2spill %u, prio2adjacent %u, spillable %u," + " wanna-spill %u, amount %u", + prio2spill, prio2adjacent, spillable, wanna_spill, amount); tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); unsigned prev_prio = 256; unsigned r, w, prio; - for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill; + pgno_t spilled_entries = 0, spilled_npages = 0; + for (w = 0, r = 1; r <= dl->length && spilled_entries < wanna_spill; prev_prio = prio, ++r) { prio = spill_prio(txn, r, reciprocal); MDBX_page *const dp = dl->items[r].ptr; @@ -4766,11 +4767,12 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1), prev_prio); --w; - rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, - dpl_npages(dl, r - 1)); + const unsigned co_npages = dpl_npages(dl, r - 1); + rc = 
spill_page(txn, &ctx, dl->items[r - 1].ptr, co_npages); if (unlikely(rc != MDBX_SUCCESS)) break; - ++spilled; + ++spilled_entries; + spilled_npages += co_npages; } DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, @@ -4778,7 +4780,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; - ++spilled; + ++spilled_entries; + spilled_npages += npages; continue; } @@ -4790,21 +4793,25 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) break; prio = prev_prio /* to continue co-spilling next adjacent pages */; - ++spilled; + ++spilled_entries; + spilled_npages += npages; continue; } } dl->items[++w] = dl->items[r]; } - tASSERT(txn, spillable == 0 || spilled > 0); + VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, + spilled_npages); + tASSERT(txn, spillable == 0 || spilled_entries > 0); while (r <= dl->length) dl->items[++w] = dl->items[r++]; - tASSERT(txn, r - 1 - w == spilled); + tASSERT(txn, r - 1 - w == spilled_entries); dl->sorted = dpl_setlen(dl, w); - txn->tw.dirtyroom += spilled; + txn->tw.dirtyroom += spilled_entries; + txn->tw.dirtylist->pages_including_loose -= spilled_npages; tASSERT(txn, dirtylist_check(txn)); if (ctx.iov_items) { @@ -4818,7 +4825,7 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; - NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled, + NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries, txn->tw.dirtyroom); iov_done(txn, &ctx); } else { From 474391c83c5f81def6fdf3b0b6f5716a87b78fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 25 Sep 2022 12:47:31 +0300 Subject: [PATCH 119/364] 
=?UTF-8?q?mdbx:=20=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5?= =?UTF-8?q?=D1=80=D0=B6=D0=BA=D0=B0=20=D0=B0=D1=81=D0=B8=D0=BD=D1=85=D1=80?= =?UTF-8?q?=D0=BE=D0=BD=D0=BD=D0=BE=D0=B3=D0=BE=20=D0=B2=D0=B2=D0=BE=D0=B4?= =?UTF-8?q?=D0=B0-=D0=B2=D1=8B=D0=B2=D0=BE=D0=B4=D0=B0=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20Windows=20=D0=B8=20=D0=BF=D0=BE=D0=B4=D0=B3=D0=BE?= =?UTF-8?q?=D1=82=D0=BE=D0=B2=D0=BA=D0=B0=20=D0=BA=20`io=5Fring`=20(=D0=BE?= =?UTF-8?q?=D0=B1=D1=8A=D0=B5=D0=B4=D0=B8=D0=BD=D1=91=D0=BD=D0=BD=D1=8B?= =?UTF-8?q?=D0=B5=20=D0=BA=D0=BE=D0=BC=D0=BC=D0=B8=D1=82=D1=8B=20=D0=B8=20?= =?UTF-8?q?=D0=B8=D1=81=D0=BF=D1=80=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TODO.md | 2 +- mdbx.h | 6 +- src/base.h | 2 +- src/core.c | 628 +++++++++++++++++++++++++------------------ src/internals.h | 29 +- src/lck-windows.c | 258 +++++++++++------- src/osal.c | 624 +++++++++++++++++++++++++++++++++++++++++- src/osal.h | 163 ++++++++++- test/osal-windows.cc | 10 +- 9 files changed, 1323 insertions(+), 399 deletions(-) diff --git a/TODO.md b/TODO.md index db853653..984b97ea 100644 --- a/TODO.md +++ b/TODO.md @@ -11,7 +11,6 @@ For the same reason ~~Github~~ is blacklisted forever. So currently most of the links are broken due to noted malicious ~~Github~~ sabotage. - - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). - [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/204). - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/210). - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200). 
@@ -27,3 +26,4 @@ Done ---- - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223). + - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). diff --git a/mdbx.h b/mdbx.h index 88550dd8..b10ab212 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2522,9 +2522,13 @@ struct MDBX_envinfo { uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ uint64_t wops; /**< Number of explicit write operations (not a pages) to a disk */ + uint64_t + msync; /**< Number of explicit msync-to-disk operations (not a pages) */ + uint64_t + fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */ uint64_t gcrtime_seconds16dot16; /**< Time spent loading and searching inside - GC (aka FreeDB) in 1/65536 of second. */ + GC (aka FreeDB) in 1/65536 of second */ } mi_pgop_stat; }; #ifndef __cplusplus diff --git a/src/base.h b/src/base.h index a927f805..1596d26a 100644 --- a/src/base.h +++ b/src/base.h @@ -63,7 +63,7 @@ #define SSIZE_MAX INTPTR_MAX #endif -#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64) #define MDBX_WORDBITS 64 #else #define MDBX_WORDBITS 32 diff --git a/src/core.c b/src/core.c index 65e7ae40..d307b60b 100644 --- a/src/core.c +++ b/src/core.c @@ -3983,13 +3983,12 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, while (--npages) { iov[n] = iov[0]; if (++n == MDBX_COMMIT_PAGES) { - osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, - pgno2bytes(env, MDBX_COMMIT_PAGES)); + osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off); iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); n = 0; } } - osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); + osal_pwritev(env->me_lazy_fd, iov, n, iov_off); } } @@ -4318,139 +4317,155 @@ static __inline int 
page_retire(MDBX_cursor *mc, MDBX_page *mp) { return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); } -struct iov_ctx { - unsigned iov_items; - size_t iov_bytes; - size_t iov_off; +typedef struct iov_ctx { + MDBX_env *env; + osal_ioring_t *ior; + int err; +#ifndef MDBX_NEED_WRITTEN_RANGE +#define MDBX_NEED_WRITTEN_RANGE 1 +#endif /* MDBX_NEED_WRITTEN_RANGE */ +#if MDBX_NEED_WRITTEN_RANGE pgno_t flush_begin; pgno_t flush_end; - struct iovec iov[MDBX_COMMIT_PAGES]; -}; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + uint64_t coherency_timestamp; +} iov_ctx_t; -static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) { - ctx->flush_begin = MAX_PAGENO; - ctx->flush_end = MIN_PAGENO; - ctx->iov_items = 0; - ctx->iov_bytes = 0; - ctx->iov_off = 0; - (void)txn; +__must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, + unsigned items, pgno_t npages) { + ctx->env = txn->mt_env; + ctx->ior = &txn->mt_env->me_ioring; + ctx->err = osal_ioring_reserve(ctx->ior, items, + pgno_align2os_bytes(txn->mt_env, npages)); + if (likely(ctx->err == MDBX_SUCCESS)) { +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = MAX_PAGENO; + ctx->flush_end = MIN_PAGENO; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + osal_ioring_reset(ctx->ior); + } + return ctx->err; } -static __inline void iov_done(MDBX_txn *const txn, struct iov_ctx *ctx) { - tASSERT(txn, ctx->iov_items == 0); -#if defined(__linux__) || defined(__gnu_linux__) - MDBX_env *const env = txn->mt_env; - if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00) - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the - * whole cache is always flushed. 
*/ - osal_flush_incoherent_mmap( - env->me_map + pgno2bytes(env, ctx->flush_begin), - pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize); -#endif /* Linux */ +static inline bool iov_empty(const iov_ctx_t *ctx) { + return osal_ioring_used(ctx->ior) == 0; } -static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) { - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - tASSERT(txn, ctx->iov_items > 0); +static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, + size_t bytes) { + MDBX_env *const env = ctx->env; + eASSERT(env, (env->me_flags & MDBX_WRITEMAP) == 0); - MDBX_env *const env = txn->mt_env; - int rc; - if (likely(ctx->iov_items == 1)) { - eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); - rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, - ctx->iov_off); - } else { - rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, - ctx->iov_bytes); - } + MDBX_page *wp = (MDBX_page *)data; + eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); + eASSERT(env, bytes2pgno(env, bytes) >= (IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u)); + eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); - if (unlikely(rc != MDBX_SUCCESS)) - ERROR("Write error: %s", mdbx_strerror(rc)); - else { - VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off, - ctx->iov_bytes); - MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off, - ctx->iov_bytes); - } - - unsigned iov_items = ctx->iov_items; -#if MDBX_ENABLE_PGOP_STAT - txn->mt_env->me_lck->mti_pgop_stat.wops.weak += iov_items; -#endif /* MDBX_ENABLE_PGOP_STAT */ - ctx->iov_items = 0; - ctx->iov_bytes = 0; - - uint64_t timestamp = 0; - for (unsigned i = 0; i < iov_items; i++) { - MDBX_page *wp = (MDBX_page *)ctx->iov[i].iov_base; - const MDBX_page *rp = pgno2page(txn->mt_env, wp->mp_pgno); + if (likely(ctx->err == MDBX_SUCCESS)) { + VALGRIND_MAKE_MEM_DEFINED(env->me_map + offset, bytes); + MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map + offset, bytes); + osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize); + const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset); /* check with timeout as the workaround * for todo4recovery://erased_by_github/libmdbx/issues/269 */ - while (likely(rc == MDBX_SUCCESS) && - unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) { - if (!timestamp) { - iov_done(txn, ctx); - WARNING( - "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); - } - if (coherency_timeout(&timestamp, wp->mp_pgno) != MDBX_RESULT_TRUE) - rc = MDBX_PROBLEM; + if (unlikely(memcmp(wp, rp, bytes))) { + ctx->coherency_timestamp = 0; + WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); + do + if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno) != + MDBX_RESULT_TRUE) { + ctx->err = MDBX_PROBLEM; + break; + } + while (unlikely(memcmp(wp, rp, bytes))); } - dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); } - return rc; + +
if (likely(bytes == env->me_psize)) + dpage_free(env, wp, 1); + else { + do { + eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); + eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); + unsigned npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u; + size_t chunk = pgno2bytes(env, npages); + eASSERT(env, bytes >= chunk); + dpage_free(env, wp, npages); + wp = (MDBX_page *)((char *)wp + chunk); + offset += chunk; + bytes -= chunk; + } while (bytes); + } } -static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, - unsigned npages) { +static void iov_complete(iov_ctx_t *ctx) { + if ((ctx->env->me_flags & MDBX_WRITEMAP) == 0) + osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages); + osal_ioring_reset(ctx->ior); +} + +__must_check_result static int iov_write(iov_ctx_t *ctx) { + eASSERT(ctx->env, !iov_empty(ctx)); + osal_ioring_write_result_t r = osal_ioring_write(ctx->ior); +#if MDBX_ENABLE_PGOP_STAT + ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; +#endif /* MDBX_ENABLE_PGOP_STAT */ + ctx->err = r.err; + if (unlikely(ctx->err != MDBX_SUCCESS)) + ERROR("Write error: %s", mdbx_strerror(ctx->err)); + iov_complete(ctx); + return ctx->err; +} + +__must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, + MDBX_page *dp, unsigned npages) { MDBX_env *const env = txn->mt_env; + tASSERT(txn, ctx->err == MDBX_SUCCESS); tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); tASSERT(txn, IS_MODIFIABLE(txn, dp)); tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); - ctx->flush_begin = - (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; - ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) - ? 
ctx->flush_end - : dp->mp_pgno + npages; - env->me_lck->mti_unsynced_pages.weak += npages; - if (IS_SHADOWED(txn, dp)) { tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); dp->mp_txnid = txn->mt_txnid; tASSERT(txn, IS_SPILLED(txn, dp)); - const size_t size = pgno2bytes(env, npages); - if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) || - ctx->iov_items == ARRAY_LENGTH(ctx->iov) || - ctx->iov_bytes + size > MAX_WRITE) { - if (ctx->iov_items) { - int err = iov_write(txn, ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#if defined(__linux__) || defined(__gnu_linux__) - if (linux_kernel_version >= 0x02060b00) - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the - * whole cache is always flushed. */ -#endif /* Linux */ - osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, - env->me_os_psize); + int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + ctx->err = err; + if (unlikely(err != MDBX_RESULT_TRUE)) { + iov_complete(ctx); + return err; } - ctx->iov_off = pgno2bytes(env, dp->mp_pgno); + err = iov_write(ctx); + tASSERT(txn, iov_empty(ctx)); + if (likely(err == MDBX_SUCCESS)) { + err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + iov_complete(ctx); + return ctx->err = err; + } + } + tASSERT(txn, ctx->err == MDBX_SUCCESS); } - ctx->iov[ctx->iov_items].iov_base = (void *)dp; - ctx->iov[ctx->iov_items].iov_len = size; - ctx->iov_items += 1; - ctx->iov_bytes += size; } else { tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); } + +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = + (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; + ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) + ? 
ctx->flush_end + : dp->mp_pgno + npages; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + env->me_lck->mti_unsynced_pages.weak += npages; return MDBX_SUCCESS; } -static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, +static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, unsigned npages) { tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); pgno_t pgno = dp->mp_pgno; @@ -4613,13 +4628,18 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, txn->tw.dirtyroom, need); tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); - struct iov_ctx ctx; - iov_init(txn, &ctx); int rc = MDBX_SUCCESS; if (txn->mt_flags & MDBX_WRITEMAP) { MDBX_dpl *const dl = txn->tw.dirtylist; const unsigned span = dl->length - txn->tw.loose_count; txn->tw.dirtyroom += span; + + iov_ctx_t ctx; + rc = iov_init(txn, &ctx, wanna_spill, + dl->pages_including_loose - txn->tw.loose_count); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + unsigned r, w; for (w = 0, r = 1; r <= dl->length; ++r) { MDBX_page *dp = dl->items[r].ptr; @@ -4749,6 +4769,13 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, prio2spill, prio2adjacent, spillable, wanna_spill, amount); tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); + iov_ctx_t ctx; + rc = iov_init(txn, &ctx, amount, + txn->tw.dirtylist->pages_including_loose - + txn->tw.loose_count); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + unsigned prev_prio = 256; unsigned r, w, prio; pgno_t spilled_entries = 0, spilled_npages = 0; @@ -4814,12 +4841,10 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, txn->tw.dirtylist->pages_including_loose -= spilled_npages; tASSERT(txn, dirtylist_check(txn)); - if (ctx.iov_items) { - /* iov_page() frees dirty-pages and reset iov_items in case of failure. 
*/ + if (!iov_empty(&ctx)) { tASSERT(txn, rc == MDBX_SUCCESS); - rc = iov_write(txn, &ctx); + rc = iov_write(&ctx); } - if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -4827,9 +4852,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, txn->mt_flags |= MDBX_TXN_SPILLS; NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries, txn->tw.dirtyroom); - iov_done(txn, &ctx); } else { - tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); + tASSERT(txn, rc == MDBX_SUCCESS); for (unsigned i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", @@ -5610,7 +5634,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), MDBX_SYNC_NONE); @@ -5743,74 +5767,71 @@ __cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, true); } -static int meta_unsteady(MDBX_env *env, const txnid_t last_steady, - MDBX_meta *const meta, mdbx_filehandle_t fd) { +static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, + const pgno_t pgno) { + MDBX_meta *const meta = METAPAGE(env, pgno); + const txnid_t txnid = constmeta_txnid(meta); + if (unlikely(err != MDBX_SUCCESS) || !META_IS_STEADY(meta) || + !(txnid < early_than)) + return err; + + WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno); const uint64_t wipe = MDBX_DATASIGN_NONE; - if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) { - WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, - data_page(meta)->mp_pgno); - if (env->me_flags & MDBX_WRITEMAP) - unaligned_poke_u64(4, meta->mm_sign, wipe); - else - return 
osal_pwrite(fd, &wipe, sizeof(meta->mm_sign), - (uint8_t *)&meta->mm_sign - env->me_map); - } - return MDBX_SUCCESS; -} - -__cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) { - MDBX_env *const env = txn->mt_env; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? env->me_dsync_fd - : env->me_lazy_fd; - int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - + const void *ptr = &wipe; + size_t bytes = sizeof(meta->mm_sign), + offset = (uint8_t *)&meta->mm_sign - env->me_map; if (env->me_flags & MDBX_WRITEMAP) { + unaligned_poke_u64(4, meta->mm_sign, wipe); osal_flush_incoherent_cpu_writeback(); err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; - } else { - if (fd == env->me_lazy_fd) { -#if MDBX_USE_SYNCFILERANGE - static bool syncfilerange_unavailable; - if (!syncfilerange_unavailable && - sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS), - SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) { - err = errno; - if (ignore_enosys(err) == MDBX_RESULT_TRUE) - syncfilerange_unavailable = true; - } - if (syncfilerange_unavailable) -#endif /* MDBX_USE_SYNCFILERANGE */ - err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); - if (unlikely(err != MDBX_SUCCESS)) - return err; } - osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); + ptr = data_page(meta); + offset = (uint8_t *)ptr - env->me_map; + bytes = env->me_psize; } +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif 
/* MDBX_ENABLE_PGOP_STAT */ + err = osal_pwrite(env->me_fd4meta, ptr, bytes, offset); + if (likely(err == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + return err; +} + +__cold static int wipe_steady(MDBX_txn *txn, txnid_t last_steady) { + MDBX_env *const env = txn->mt_env; + int err = MDBX_SUCCESS; + + /* early than last_steady */ + err = meta_unsteady(err, env, last_steady, 0); + err = meta_unsteady(err, env, last_steady, 1); + err = meta_unsteady(err, env, last_steady, 2); + + /* the last_steady */ + err = meta_unsteady(err, env, last_steady + 1, 0); + err = meta_unsteady(err, env, last_steady + 1, 1); + err = meta_unsteady(err, env, last_steady + 1, 2); + + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); + /* force oldest refresh */ atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); txn->tw.troika = meta_tap(env); for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child) if (scan != txn) scan->tw.troika = txn->tw.troika; - return MDBX_SUCCESS; + return err; } //------------------------------------------------------------------------------ @@ -7052,6 +7073,40 @@ fail: return rc; } +static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { + eASSERT(env, atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != + (uint32_t)head.txnid); + /* Функция может вызываться (в том числе) при (env->me_flags & + * MDBX_NOMETASYNC) == 0 и env->me_fd4meta == env->me_dsync_fd, например если + * предыдущая транзакция была выполненна с флагом MDBX_NOMETASYNC. 
*/ + + int rc = MDBX_RESULT_TRUE; + if (env->me_flags & MDBX_WRITEMAP) { +#if MDBX_ENABLE_PGOP_ST + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(head.ptr_c); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + (uint8_t *)page - env->me_map); + + if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + } else { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + + if (likely(rc == MDBX_SUCCESS)) + env->me_lck->mti_meta_sync_txnid.weak = (uint32_t)head.txnid; + return rc; +} + __cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { bool locked = false; int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; @@ -7104,7 +7159,7 @@ retry:; int err; /* pre-sync to avoid latency for writer */ - if (unsynced_pages > /* FIXME: define threshold */ 16 && + if (unsynced_pages > /* FIXME: define threshold */ 42 && (flags & MDBX_SAFE_NOSYNC) == 0) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { @@ -7173,19 +7228,8 @@ retry:; /* LY: sync meta-pages if MDBX_NOMETASYNC enabled * and someone was not synced above. */ if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)head.txnid) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = (flags & MDBX_WRITEMAP) - ? 
osal_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA | MDBX_SYNC_IODQ) - : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (likely(rc == MDBX_SUCCESS)) - atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid, - mo_Relaxed); - } + (uint32_t)head.txnid) + rc = meta_sync(env, head); bailout: if (locked) @@ -7628,7 +7672,8 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, __cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { if (likely(timestamp && *timestamp == 0)) *timestamp = osal_monotime(); - else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) { + else if (unlikely(!timestamp || osal_monotime() - *timestamp > + osal_16dot16_to_monotime(65536 / 10))) { if (pgno) ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); @@ -9902,7 +9947,7 @@ bailout: return rc; } -static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { +static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { MDBX_dpl *const dl = (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn); int rc = MDBX_SUCCESS; @@ -9919,10 +9964,9 @@ static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { break; } - if (ctx->iov_items) { - /* iov_page() frees dirty-pages and reset iov_items in case of failure. 
*/ + if (!iov_empty(ctx)) { tASSERT(txn, rc == MDBX_SUCCESS); - rc = iov_write(txn, ctx); + rc = iov_write(ctx); } while (r <= dl->length) @@ -10568,42 +10612,53 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - struct iov_ctx write_ctx; - iov_init(txn, &write_ctx); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + iov_ctx_t write_ctx; + rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid, + mo_Relaxed) != (uint32_t)head.txnid) { + /* sync prev meta */ + rc = meta_sync(env, head); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + rc = txn_write(txn, &write_ctx); - if (likely(rc == MDBX_SUCCESS)) - iov_done(txn, &write_ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ ts_3 = latency ? 
osal_monotime() : 0; - if (likely(rc == MDBX_SUCCESS)) { - const meta_ptr_t head = meta_recent(env, &txn->tw.troika); - MDBX_meta meta; - memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); - meta.mm_extra_flags = head.ptr_c->mm_extra_flags; - meta.mm_validator_id = head.ptr_c->mm_validator_id; - meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; - unaligned_poke_u64(4, meta.mm_pages_retired, - unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + - MDBX_PNL_SIZE(txn->tw.retired_pages)); - meta.mm_geo = txn->mt_geo; - meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - meta.mm_canary = txn->mt_canary; + MDBX_meta meta; + memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); + meta.mm_extra_flags = head.ptr_c->mm_extra_flags; + meta.mm_validator_id = head.ptr_c->mm_validator_id; + meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; + unaligned_poke_u64(4, meta.mm_pages_retired, + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + + MDBX_PNL_SIZE(txn->tw.retired_pages)); + meta.mm_geo = txn->mt_geo; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_canary = txn->mt_canary; - txnid_t commit_txnid = txn->mt_txnid; + txnid_t commit_txnid = txn->mt_txnid; #if MDBX_ENABLE_BIGFOOT - if (gcu_ctx.bigfoot > txn->mt_txnid) { - commit_txnid = gcu_ctx.bigfoot; - TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, - (unsigned)(commit_txnid - txn->mt_txnid)); - } -#endif - meta_set_txnid(env, &meta, commit_txnid); - - rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, - &meta, &txn->tw.troika); + if (gcu_ctx.bigfoot > txn->mt_txnid) { + commit_txnid = gcu_ctx.bigfoot; + TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, + (unsigned)(commit_txnid - txn->mt_txnid)); } +#endif + meta_set_txnid(env, &meta, commit_txnid); + + rc = sync_locked(env, env->me_flags | txn->mt_flags | 
MDBX_SHRINK_ALLOWED, + &meta, &txn->tw.troika); ts_4 = latency ? osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; @@ -10894,11 +10949,11 @@ static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, __cold static int read_header(MDBX_env *env, MDBX_meta *dest, const int lck_exclusive, const mdbx_mode_t mode_bits) { + memset(dest, 0, sizeof(MDBX_meta)); int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; - memset(dest, 0, sizeof(MDBX_meta)); unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK); rc = MDBX_CORRUPTED; @@ -11200,7 +11255,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; + unsigned sync_op = 0; if ((flags & MDBX_SAFE_NOSYNC) == 0) { + sync_op = 1; mode_bits = MDBX_SYNC_DATA; if (pending->mm_geo.next > meta_prefer_steady(env, troika).ptr_c->mm_geo.now) @@ -11209,7 +11266,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, mode_bits |= MDBX_SYNC_IODQ; } #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) rc = @@ -11298,9 +11355,6 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); ENSURE(env, target == head.ptr_c || constmeta_txnid(target) < pending->unsafe_txnid); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) { jitter4testing(true); if (likely(target != head.ptr_c)) { @@ -11338,34 +11392,37 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, 
osal_flush_incoherent_cpu_writeback(); jitter4testing(true); /* sync meta-pages */ - rc = - osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) + ? MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { - const MDBX_meta undo_meta = *target; - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? env->me_dsync_fd - : env->me_lazy_fd; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_pwrite(fd, pending, sizeof(MDBX_meta), + const MDBX_meta undo_meta = *target; + rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: DEBUG("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. 
*/ - osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta), + osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); goto fail; } osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); /* sync meta-pages */ - if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { + if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (rc != MDBX_SUCCESS) goto undo; @@ -11382,7 +11439,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, goto fail; } env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)pending->unsafe_txnid - + pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak - ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0); *troika = meta_tap(env); @@ -11528,9 +11585,11 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_maxreaders = DEFAULT_READERS; env->me_maxdbs = env->me_numdbs = CORE_DBS; - env->me_lazy_fd = INVALID_HANDLE_VALUE; - env->me_dsync_fd = INVALID_HANDLE_VALUE; - env->me_lfd = INVALID_HANDLE_VALUE; + env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_fd4data = +#if defined(_WIN32) || defined(_WIN64) + env->me_overlapped_fd = +#endif /* Windows */ + env->me_lfd = INVALID_HANDLE_VALUE; env->me_pid = osal_getpid(); env->me_stuck_meta = -1; @@ -12863,10 +12922,10 @@ __cold static int __must_check_result override_meta(MDBX_env *env, if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0) return MDBX_SUCCESS; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ if (env->me_flags & MDBX_WRITEMAP) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, model->mm_geo.next), 
MDBX_SYNC_DATA | MDBX_SYNC_IODQ); @@ -12877,18 +12936,26 @@ __cold static int __must_check_result override_meta(MDBX_env *env, * clearing consistency flag by mdbx_meta_update_begin() */ memcpy(pgno2page(env, target), page, env->me_psize); osal_flush_incoherent_cpu_writeback(); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? env->me_dsync_fd - : env->me_lazy_fd; - rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target)); - if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd) +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + pgno2bytes(env, target)); + if (rc == MDBX_SUCCESS && env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); } - osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); eASSERT(env, !env->me_txn && !env->me_txn0); return rc; } @@ -13254,14 +13321,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (rc != MDBX_SUCCESS) goto bailout; - eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, - &env->me_dsync_fd, 0); - ENSURE(env, - (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE)); - } - #if MDBX_LOCKING == MDBX_LOCKING_SYSV env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); if (env->me_sysv_ipc.key == -1) { @@ 
-13270,7 +13329,30 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } #endif /* MDBX_LOCKING */ -#if !(defined(_WIN32) || defined(_WIN64)) + /* Set the position in files outside of the data to avoid corruption + * due to erroneous use of file descriptors in the application code. */ + const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000); + osal_fseek(env->me_lazy_fd, safe_parking_lot_offset); + + env->me_fd4data = env->me_fd4meta = env->me_lazy_fd; +#if defined(_WIN32) || defined(_WIN64) + uint8_t ior_flags = 0; + if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) { + ior_flags = IOR_OVERLAPPED; + rc = + osal_openfile(MDBX_OPEN_DXB_OVERLAPPED, + env, env_pathname.dxb, &env->me_overlapped_fd, 0); + if (rc != MDBX_SUCCESS) + goto bailout; + env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); + if (!env->me_data_lock_event) { + rc = (int)GetLastError(); + goto bailout; + } + env->me_fd4data = env->me_overlapped_fd; + osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); + } +#else if (mode == 0) { /* pickup mode for lck-file */ struct stat st; @@ -13291,13 +13373,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = lck_rc; goto bailout; } - - /* Set the position in files outside of the data to avoid corruption - * due to erroneous use of file descriptors in the application code. 
*/ - osal_fseek(env->me_lfd, UINT64_C(1) << 63); - osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63); - if (env->me_dsync_fd != INVALID_HANDLE_VALUE) - osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + osal_fseek(env->me_lfd, safe_parking_lot_offset); const MDBX_env_flags_t rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; @@ -13305,6 +13381,20 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, MDBX_LIFORECLAIM | MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; + eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); + if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0 && + (env->me_fd4data == env->me_lazy_fd || !(flags & MDBX_NOMETASYNC))) { + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + &env->me_dsync_fd, 0); + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { + if ((flags & MDBX_NOMETASYNC) == 0) + env->me_fd4meta = env->me_dsync_fd; + if (env->me_fd4data == env->me_lazy_fd) + env->me_fd4data = env->me_dsync_fd; + osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); + } + } + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) { @@ -13413,6 +13503,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } else rc = MDBX_ENOMEM; } + if (rc == MDBX_SUCCESS) + rc = osal_ioring_create(&env->me_ioring, +#if defined(_WIN32) || defined(_WIN64) + ior_flags, +#endif /* Windows */ + env->me_fd4data); } #if MDBX_DEBUG @@ -13469,6 +13565,8 @@ __cold static int env_close(MDBX_env *env) { const int rc = lcklist_detach_locked(env); lcklist_unlock(); + osal_ioring_destroy(&env->me_ioring); + if (env->me_map) { osal_munmap(&env->me_dxb_mmap); #ifdef MDBX_USE_VALGRIND @@ -13477,6 +13575,14 @@ __cold static int env_close(MDBX_env *env) { #endif } +#if defined(_WIN32) || defined(_WIN64) + if (env->me_overlapped_fd != INVALID_HANDLE_VALUE) { + 
CloseHandle(env->me_data_lock_event); + CloseHandle(env->me_overlapped_fd); + env->me_overlapped_fd = INVALID_HANDLE_VALUE; + } +#endif /* Windows */ + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { (void)osal_closefile(env->me_dsync_fd); env->me_dsync_fd = INVALID_HANDLE_VALUE; @@ -13578,7 +13684,7 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { ? MDBX_SUCCESS : rc; } -#endif +#endif /* Windows */ } eASSERT(env, env->me_signature.weak == 0); @@ -20528,6 +20634,10 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); + arg->mi_pgop_stat.msync = + atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); + arg->mi_pgop_stat.fsync = + atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed); arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16( atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed)); #else diff --git a/src/internals.h b/src/internals.h index 74137815..05f7393f 100644 --- a/src/internals.h +++ b/src/internals.h @@ -591,6 +591,10 @@ typedef struct { MDBX_atomic_uint64_t gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The unit/scale is platform-depended, see osal_monotime(). 
*/ + MDBX_atomic_uint64_t + msync; /* Number of explicit msync/flush-to-disk operations */ + MDBX_atomic_uint64_t + fsync; /* Number of explicit fsync/flush-to-disk operations */ } MDBX_pgop_stat_t; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -1143,7 +1147,11 @@ struct MDBX_env { osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_lazy_fd me_dxb_mmap.fd - mdbx_filehandle_t me_dsync_fd; +#define me_fd4data me_ioring.fd + mdbx_filehandle_t me_dsync_fd, me_fd4meta; +#if defined(_WIN32) || defined(_WIN64) + HANDLE me_overlapped_fd, me_data_lock_event; +#endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; @@ -1222,6 +1230,7 @@ struct MDBX_env { unsigned me_dp_reserve_len; /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; + osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) osal_srwlock_t me_remap_guard; @@ -1609,20 +1618,24 @@ ceil_powerof2(size_t value, size_t granularity) { } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned -log2n_powerof2(size_t value) { - assert(value > 0 && value < INT32_MAX && is_powerof2(value)); - assert((value & -(int32_t)value) == value); -#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) - return __builtin_ctzl(value); +log2n_powerof2(size_t value_uintptr) { + assert(value_uintptr > 0 && value_uintptr < INT32_MAX && + is_powerof2(value_uintptr)); + assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr); + const uint32_t value_uint32 = (uint32_t)value_uintptr; +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz) + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned)); + return __builtin_ctz(value_uint32); #elif defined(_MSC_VER) unsigned long index; - _BitScanForward(&index, (unsigned long)value); + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long)); + _BitScanForward(&index, value_uint32); return index; #else static const uint8_t debruijn_ctz32[32] = { 0, 
1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; - return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27]; + return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27]; #endif } diff --git a/src/lck-windows.c b/src/lck-windows.c index 7b833773..7038854e 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -112,32 +112,73 @@ static #define LCK_WAITFOR 0 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY -static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset, - size_t bytes) { +static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags, + uint64_t offset, size_t bytes) { + TRACE("lock>>: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64 + " >>", + fd, event, flags, offset, bytes); OVERLAPPED ov; - ov.hEvent = 0; + ov.Internal = 0; + ov.InternalHigh = 0; + ov.hEvent = event; ov.Offset = (DWORD)offset; ov.OffsetHigh = HIGH_DWORD(offset); - return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); + if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 + ", bytes %" PRId64 " << %s", + fd, event, flags, offset, bytes, "done"); + return MDBX_SUCCESS; + } + + DWORD rc = GetLastError(); + if (rc == ERROR_IO_PENDING) { + if (event) { + if (GetOverlappedResult(fd, &ov, &rc, true)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 + ", bytes %" PRId64 " << %s", + fd, event, flags, offset, bytes, "overlapped-done"); + return MDBX_SUCCESS; + } + rc = GetLastError(); + } else + CancelIo(fd); + } + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64 + " << err %d", + fd, event, flags, offset, bytes, rc); + return (int)rc; } -static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset, - size_t bytes) { +static __inline int flock(HANDLE fd, DWORD flags, uint64_t offset, + size_t bytes) { + return flock_with_event(fd, 0, 
flags, offset, bytes); +} + +static __inline int flock_data(const MDBX_env *env, DWORD flags, + uint64_t offset, size_t bytes) { + return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags, + offset, bytes); +} + +static int funlock(mdbx_filehandle_t fd, uint64_t offset, size_t bytes) { + TRACE("unlock: fd %p, offset %" PRId64 ", bytes %" PRId64, fd, offset, bytes); return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, - HIGH_DWORD(bytes)); + HIGH_DWORD(bytes)) + ? MDBX_SUCCESS + : (int)GetLastError(); } /*----------------------------------------------------------------------------*/ /* global `write` lock for write-txt processing, * exclusive locking both meta-pages) */ -#define LCK_MAXLEN (1u + ((~(size_t)0) >> 1)) -#define LCK_META_OFFSET 0 -#define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS) -#define LCK_BODY_OFFSET LCK_META_LEN -#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET) -#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN -#define LCK_WHOLE 0, LCK_MAXLEN +#ifdef _WIN64 +#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000) +#else +#define DXB_MAXLEN UINT32_C(0x7ff00000) +#endif +#define DXB_BODY (env->me_psize * NUM_METAS), DXB_MAXLEN +#define DXB_WHOLE 0, DXB_MAXLEN int mdbx_txn_lock(MDBX_env *env, bool dontwait) { if (dontwait) { @@ -155,24 +196,27 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } - if ((env->me_flags & MDBX_EXCLUSIVE) || - flock(env->me_lazy_fd, - dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) - : (LCK_EXCLUSIVE | LCK_WAITFOR), - LCK_BODY)) + if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS; - int rc = (int)GetLastError(); + + int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) + : (LCK_EXCLUSIVE | LCK_WAITFOR), + DXB_BODY); + if (rc == MDBX_SUCCESS) + return rc; + LeaveCriticalSection(&env->me_windowsbug_lock); return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? 
rc : MDBX_BUSY; } void mdbx_txn_unlock(MDBX_env *env) { - int rc = (env->me_flags & MDBX_EXCLUSIVE) - ? TRUE - : funlock(env->me_lazy_fd, LCK_BODY); + if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { + int err = funlock(env->me_fd4data, DXB_BODY); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); + } LeaveCriticalSection(&env->me_windowsbug_lock); - if (!rc) - mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); } /*----------------------------------------------------------------------------*/ @@ -193,32 +237,32 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { /* transition from S-? (used) to S-E (locked), * e.g. exclusive lock upper-part */ - if ((env->me_flags & MDBX_EXCLUSIVE) || - flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) + if (env->me_flags & MDBX_EXCLUSIVE) + return MDBX_SUCCESS; + + int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc == MDBX_SUCCESS) return MDBX_SUCCESS; - int rc = (int)GetLastError(); osal_srwlock_ReleaseShared(&env->me_remap_guard); return rc; } MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { - if (env->me_lfd != INVALID_HANDLE_VALUE) { + if (env->me_lfd != INVALID_HANDLE_VALUE && + (env->me_flags & MDBX_EXCLUSIVE) == 0) { /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */ - if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && - !funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); + int err = funlock(env->me_lfd, LCK_UPPER); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); } osal_srwlock_ReleaseShared(&env->me_remap_guard); } MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { - return flock(fd, - wait ? LCK_EXCLUSIVE | LCK_WAITFOR - : LCK_EXCLUSIVE | LCK_DONTWAIT, - 0, LCK_MAXLEN) - ? MDBX_SUCCESS - : (int)GetLastError(); + return flock( + fd, wait ? 
LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0, + DXB_MAXLEN); } static int suspend_and_append(mdbx_handle_array_t **array, @@ -386,40 +430,36 @@ static void lck_unlock(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* double `unlock` for robustly remove overlapped shared/exclusive locks */ - while (funlock(env->me_lfd, LCK_LOWER)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_lfd, LCK_LOWER); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_lfd, LCK_UPPER)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_lfd, LCK_UPPER); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); } - if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { + if (env->me_fd4data != INVALID_HANDLE_VALUE) { /* explicitly unlock to avoid latency for other processes (windows kernel * releases such locks via deferred queues) */ - while (funlock(env->me_lazy_fd, LCK_BODY)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_fd4data, DXB_BODY); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_lazy_fd, LCK_WHOLE)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_fd4data, DXB_WHOLE); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); } } @@ -428,56 +468,55 @@ static void lck_unlock(MDBX_env *env) { * or as 'used' (S-? and returns MDBX_RESULT_FALSE). * Otherwise returns an error. */ static int internal_seize_lck(HANDLE lfd) { - int rc; assert(lfd != INVALID_HANDLE_VALUE); /* 1) now on ?-? 
(free), get ?-E (middle) */ jitter4testing(false); - if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { - rc = (int)GetLastError() /* 2) something went wrong, give up */; + int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); return rc; } /* 3) now on ?-E (middle), try E-E (exclusive-write) */ jitter4testing(false); - if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) + rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + if (rc == MDBX_SUCCESS) return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; /* 5) still on ?-E (middle) */ - rc = (int)GetLastError(); jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ - if (!funlock(lfd, LCK_UPPER)) + int err = funlock(lfd, LCK_UPPER); /* keep the flock() error in rc */ + if (err != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", - (int)GetLastError()); + err); return rc; } /* 7) still on ?-E (middle), try S-E (locked) */ jitter4testing(false); - rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE - : (int)GetLastError(); + rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); jitter4testing(false); - if (rc != MDBX_RESULT_FALSE) + if (rc != MDBX_SUCCESS) ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), * transition to S-? (used) or ?-? (free) */ - if (!funlock(lfd, LCK_UPPER)) + int err = funlock(lfd, LCK_UPPER); + if (err != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, - "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError()); + "X-E(locked/middle) >> X-?(used/free)", err); /* 9) now on S-? (used, DONE) or ?-?
(free, FAILURE) */ return rc; } MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { - int rc; - - assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); + assert(env->me_fd4data != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_RESULT_TRUE /* nope since files were must be opened non-shareable */ @@ -486,15 +525,13 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. on read-only filesystem) */ jitter4testing(false); - if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { - rc = (int)GetLastError(); + int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE); + if (rc != MDBX_SUCCESS) ERROR("%s, err %u", "without-lck", rc); - return rc; - } - return MDBX_RESULT_FALSE; + return rc; } - rc = internal_seize_lck(env->me_lfd); + int rc = internal_seize_lck(env->me_lfd); jitter4testing(false); if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. @@ -503,17 +540,18 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { * - we need an exclusive lock for do so; * - we can't lock meta-pages, otherwise other process could get an error * while opening db in valid (non-conflict) mode. 
*/ - if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { - rc = (int)GetLastError(); - ERROR("%s, err %u", "lock-against-without-lck", rc); + int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE); + if (err != MDBX_SUCCESS) { + ERROR("%s, err %u", "lock-against-without-lck", err); jitter4testing(false); lck_unlock(env); - } else { - jitter4testing(false); - if (!funlock(env->me_lazy_fd, LCK_BODY)) - mdbx_panic("%s(%s) failed: err %u", __func__, - "unlock-against-without-lck", (int)GetLastError()); + return err; } + jitter4testing(false); + err = funlock(env->me_fd4data, DXB_WHOLE); + if (err != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, + "unlock-against-without-lck", err); } return rc; @@ -521,28 +559,31 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { /* Transite from exclusive-write state (E-E) to used (S-?) */ - assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); + assert(env->me_fd4data != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; /* 1) now at E-E (exclusive-write), transition to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) + int rc = funlock(env->me_lfd, LCK_LOWER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, - "E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError()); + "E-E(exclusive-write) >> ?-E(middle)", rc); /* 2) now at ?-E (middle), transition to S-E (locked) */ - if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - int rc = (int)GetLastError() /* 3) something went wrong, give up */; + rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 3) something went wrong, give up */; ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); return rc; } /* 4) got S-E (locked), continue transition to S-? 
(used) */ - if (!funlock(env->me_lfd, LCK_UPPER)) + rc = funlock(env->me_lfd, LCK_UPPER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)", - (int)GetLastError()); + rc); return MDBX_SUCCESS /* 5) now at S-? (used), done */; } @@ -555,24 +596,26 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; - int rc; /* 1) now on S-? (used), try S-E (locked) */ jitter4testing(false); - if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) { - rc = (int)GetLastError() /* 2) something went wrong, give up */; + int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); return rc; } /* 3) now on S-E (locked), transition to ?-E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) + rc = funlock(env->me_lfd, LCK_LOWER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)", - (int)GetLastError()); + rc); /* 4) now on ?-E (middle), try E-E (exclusive-write) */ jitter4testing(false); - if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) { - rc = (int)GetLastError() /* 5) something went wrong, give up */; + rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 5) something went wrong, give up */; VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); return rc; } @@ -586,6 +629,23 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, (void)env; (void)inprocess_neighbor; (void)global_uniqueness_flag; + if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) { + HANDLE token = INVALID_HANDLE_VALUE; + TOKEN_PRIVILEGES privileges; + privileges.PrivilegeCount = 1; + privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (!OpenProcessToken(GetCurrentProcess(), 
TOKEN_ADJUST_PRIVILEGES, + &token) || + !LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, + &privileges.Privileges[0].Luid) || + !AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges), + nullptr, nullptr) || + GetLastError() != ERROR_SUCCESS) + mdbx_SetFileIoOverlappedRange = NULL; + + if (token != INVALID_HANDLE_VALUE) + CloseHandle(token); + } return MDBX_SUCCESS; } @@ -752,6 +812,7 @@ MDBX_NtFsControlFile mdbx_NtFsControlFile; MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; MDBX_GetTickCount64 mdbx_GetTickCount64; MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; #endif /* xMDBX_ALLOY */ #if __GNUC_PREREQ(8, 0) @@ -783,6 +844,7 @@ static void mdbx_winnt_import(void) { GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); + GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); } const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); diff --git a/src/osal.c b/src/osal.c index b8b8cf54..77b6adfc 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1,4 +1,4 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* * Copyright 2015-2022 Leonid Yuriev @@ -537,6 +537,596 @@ MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, /*----------------------------------------------------------------------------*/ +#if defined(_WIN32) || defined(_WIN64) +#define ior_alignment_mask (ior->pagesize - 1) +#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element)) + +static void ior_put_event(osal_ioring_t *ior, HANDLE event) { + assert(event && event != INVALID_HANDLE_VALUE && event != ior); + assert(ior->event_stack < ior->allocated); + ior->event_pool[ior->event_stack] = event; + ior->event_stack += 1; +} + +static HANDLE ior_get_event(osal_ioring_t *ior) { + assert(ior->event_stack 
<= ior->allocated); + if (ior->event_stack > 0) { + ior->event_stack -= 1; + assert(ior->event_pool[ior->event_stack] != 0); + return ior->event_pool[ior->event_stack]; + } + return CreateEventW(nullptr, true, false, nullptr); +} + +static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) { + osal_ioring_t *ior = ov->hEvent; + ov->Internal = err; + ov->InternalHigh = bytes; + if (++ior->async_completed >= ior->async_waiting) + SetEvent(ior->async_done); +} + +#elif MDBX_HAVE_PWRITEV +#if defined(_SC_IOV_MAX) +static size_t osal_iov_max; +#define OSAL_IOV_MAX osal_iov_max +#else +#define OSAL_IOV_MAX IOV_MAX +#endif +#else +#undef OSAL_IOV_MAX +#endif /* OSAL_IOV_MAX */ + +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, +#if defined(_WIN32) || defined(_WIN64) + unsigned flags, +#endif /* Windows */ + mdbx_filehandle_t fd) { + memset(ior, 0, sizeof(osal_ioring_t)); + ior->fd = fd; + +#if defined(_WIN32) || defined(_WIN64) + ior->flags = flags; + const unsigned pagesize = (unsigned)osal_syspagesize(); + ior->pagesize = pagesize; + ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize); + ior->async_done = ior_get_event(ior); + if (!ior->async_done) + return GetLastError(); +#endif /* !Windows */ + +#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) + if (!osal_iov_max) + osal_iov_max = sysconf(_SC_IOV_MAX); +#endif + + ior->boundary = (char *)(ior->pool + ior->allocated); + return MDBX_SUCCESS; +} + +static __inline size_t ior_offset(const ior_item_t *item) { +#if defined(_WIN32) || defined(_WIN64) + return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset)) + ? 
(uint64_t)item->ov.OffsetHigh << 32 + : 0); +#else + return item->offset; +#endif /* !Windows */ +} + +static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) { +#if defined(ior_sgv_element) + assert(sgvcnt > 0); + return (ior_item_t *)((char *)item + sizeof(ior_item_t) - + sizeof(ior_sgv_element) + + sizeof(ior_sgv_element) * sgvcnt); +#else + assert(sgvcnt == 1); + (void)sgvcnt; + return item + 1; +#endif +} + +MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, + void *data, const size_t bytes) { + + assert(bytes && data); + assert(bytes % MIN_PAGESIZE == 0 && bytes <= MAX_WRITE); + assert(offset % MIN_PAGESIZE == 0 && offset + (uint64_t)bytes <= MAX_MAPSIZE); + +#if defined(_WIN32) || defined(_WIN64) + const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2); + const bool use_gather = + (ior->flags & IOR_UNBUFFERED) && ior->slots_left >= segments; +#endif /* Windows */ + + ior_item_t *item = ior->pool; + if (likely(ior->last)) { + item = ior->last; + if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) && + likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) { +#if defined(_WIN32) || defined(_WIN64) + if (use_gather && + ((bytes | (uintptr_t)data | ior->last_bytes | + (uintptr_t)(uint64_t)item->sgv[0].Buffer) & + ior_alignment_mask) == 0 && + ior->last_sgvcnt + segments < OSAL_IOV_MAX) { + assert((item->single.iov_len & 1) == 0); + assert(item->sgv[ior->last_sgvcnt].Buffer == 0); + ior->last_bytes += bytes; + size_t i = 0; + do { + item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data); + data = (char *)data + ior->pagesize; + } while (++i < segments); + ior->slots_left -= segments; + item->sgv[ior->last_sgvcnt += segments].Buffer = 0; + assert((item->single.iov_len & 1) == 0); + return MDBX_SUCCESS; + } + const void *end = + (char *)(item->single.iov_base) + item->single.iov_len - 1; + if (unlikely(end == data)) { + assert((item->single.iov_len & 1) != 0); + item->single.iov_len += bytes; 
+ return MDBX_SUCCESS; + } +#elif MDBX_HAVE_PWRITEV + assert((int)item->sgvcnt > 0); + const void *end = (char *)(item->sgv[item->sgvcnt - 1].iov_base) + + item->sgv[item->sgvcnt - 1].iov_len; + if (unlikely(end == data)) { + item->sgv[item->sgvcnt - 1].iov_len += bytes; + ior->last_bytes += bytes; + return MDBX_SUCCESS; + } + if (likely(item->sgvcnt < OSAL_IOV_MAX)) { + if (unlikely(ior->slots_left < 1)) + return MDBX_RESULT_TRUE; + item->sgv[item->sgvcnt].iov_base = data; + item->sgv[item->sgvcnt].iov_len = bytes; + ior->last_bytes += bytes; + item->sgvcnt += 1; + ior->slots_left -= 1; + return MDBX_SUCCESS; + } +#else + const void *end = (char *)(item->single.iov_base) + item->single.iov_len; + if (unlikely(end == data)) { + item->single.iov_len += bytes; + return MDBX_SUCCESS; + } +#endif + } + item = ior_next(item, ior_last_sgvcnt(ior, item)); + } + + if (unlikely(ior->slots_left < 1)) + return MDBX_RESULT_TRUE; + + unsigned slots_used = 1; +#if defined(_WIN32) || defined(_WIN64) + item->ov.Internal = item->ov.InternalHigh = 0; + item->ov.Offset = (DWORD)offset; + item->ov.OffsetHigh = HIGH_DWORD(offset); + item->ov.hEvent = 0; + if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 || + segments > OSAL_IOV_MAX) { + /* WriteFile() */ + item->single.iov_base = data; + item->single.iov_len = bytes + 1; + assert((item->single.iov_len & 1) != 0); + } else { + /* WriteFileGather() */ + item->sgv[0].Buffer = PtrToPtr64(data); + for (size_t i = 1; i < segments; ++i) { + data = (char *)data + ior->pagesize; + item->sgv[i].Buffer = PtrToPtr64(data); /* one entry per page */ + } + item->sgv[segments].Buffer = 0; /* zero entry terminates the list */ + assert((item->single.iov_len & 1) == 0); + slots_used = segments; + } + ior->last_bytes = bytes; + ior_last_sgvcnt(ior, item) = slots_used; +#elif MDBX_HAVE_PWRITEV + item->offset = offset; + item->sgv[0].iov_base = data; + item->sgv[0].iov_len = bytes; + ior->last_bytes = bytes; + ior_last_sgvcnt(ior, item) = slots_used; +#else + item->offset = offset;
+ item->single.iov_base = data; + item->single.iov_len = bytes; +#endif /* !Windows */ + ior->slots_left -= slots_used; + ior->last = item; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC void osal_ioring_walk( + osal_ioring_t *ior, iov_ctx_t *ctx, + void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)) { + for (ior_item_t *item = ior->pool; item <= ior->last;) { +#if defined(_WIN32) || defined(_WIN64) + size_t offset = ior_offset(item); + char *data = item->single.iov_base; + size_t bytes = item->single.iov_len - 1; + size_t i = 1; + if (bytes & 1) { + data = Ptr64ToPtr(item->sgv[0].Buffer); + bytes = ior->pagesize; + while (item->sgv[i].Buffer) { + if (data + ior->pagesize != item->sgv[i].Buffer) { + callback(ctx, offset, data, bytes); + offset += bytes; + data = Ptr64ToPtr(item->sgv[i].Buffer); + bytes = 0; + } + bytes += ior->pagesize; + ++i; + } + } + assert(bytes < MAX_WRITE); + callback(ctx, offset, data, bytes); +#elif MDBX_HAVE_PWRITEV + assert(item->sgvcnt > 0); + size_t offset = item->offset; + size_t i = 0; + do { + callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len); + offset += item->sgv[i].iov_len; + } while (++i != item->sgvcnt); +#else + const size_t i = 1; + callback(ctx, item->offset, item->single.iov_base, item->single.iov_len); +#endif + item = ior_next(item, i); + } +} + +MDBX_INTERNAL_FUNC osal_ioring_write_result_t +osal_ioring_write(osal_ioring_t *ior) { + osal_ioring_write_result_t r = {MDBX_SUCCESS, 0}; + +#if defined(_WIN32) || defined(_WIN64) + HANDLE *const end_wait_for = + ior->event_pool + ior->allocated + + /* one extra element was allocated for async_done */ 1; + HANDLE *wait_for = end_wait_for; + LONG async_started = 0; + for (ior_item_t *item = ior->pool; item <= ior->last;) { + item->ov.Internal = STATUS_PENDING; + size_t i = 1, bytes = item->single.iov_len - 1; + r.wops += 1; + if (bytes & 1) { + bytes = ior->pagesize; + while (item->sgv[i].Buffer) { + bytes += ior->pagesize; + ++i; +
} + assert(bytes < MAX_WRITE); + item->ov.hEvent = ior_get_event(ior); + if (unlikely(!item->ov.hEvent)) { + bailout_geterr: + r.err = GetLastError(); + bailout_rc: + assert(r.err != MDBX_SUCCESS); + CancelIo(ior->fd); + return r; + } + if (WriteFileGather(ior->fd, item->sgv, (DWORD)bytes, nullptr, + &item->ov)) { + assert(item->ov.Internal == 0 && + WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0); + ior_put_event(ior, item->ov.hEvent); + item->ov.hEvent = 0; + } else { + r.err = (int)GetLastError(); + if (unlikely(r.err != ERROR_IO_PENDING)) { + ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileGather", ior->fd, item, item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + goto bailout_rc; + } + assert(wait_for > ior->event_pool + ior->event_stack); + *--wait_for = item->ov.hEvent; + } + } else if (ior->flags & IOR_OVERLAPPED) { + assert(bytes < MAX_WRITE); + retry: + item->ov.hEvent = ior; + if (WriteFileEx(ior->fd, item->single.iov_base, (DWORD)bytes, &item->ov, + ior_wocr)) { + async_started += 1; + } else { + r.err = (int)GetLastError(); + switch (r.err) { + default: + ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileEx", ior->fd, item, item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + goto bailout_rc; + case ERROR_NOT_FOUND: + case ERROR_USER_MAPPED_FILE: + case ERROR_LOCK_VIOLATION: + WARNING( + "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileEx", ior->fd, item, item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + SleepEx(0, true); + goto retry; + case ERROR_INVALID_USER_BUFFER: + case ERROR_NOT_ENOUGH_MEMORY: + if (SleepEx(0, true) == 
WAIT_IO_COMPLETION) + goto retry; + goto bailout_rc; + case ERROR_IO_PENDING: + async_started += 1; + } + } + } else { + assert(bytes < MAX_WRITE); + DWORD written = 0; + if (!WriteFile(ior->fd, item->single.iov_base, (DWORD)bytes, &written, + &item->ov)) { + r.err = (int)GetLastError(); + ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFile", ior->fd, item, item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + goto bailout_rc; + } else if (unlikely(written != bytes)) { + r.err = ERROR_WRITE_FAULT; + goto bailout_rc; + } + } + item = ior_next(item, i); + } + + assert(ior->async_waiting > ior->async_completed && + ior->async_waiting == INT_MAX); + ior->async_waiting = async_started; + if (async_started > ior->async_completed && end_wait_for == wait_for) { + assert(wait_for > ior->event_pool + ior->event_stack); + *--wait_for = ior->async_done; + } + + const size_t pending_count = end_wait_for - wait_for; + if (pending_count) { + /* Wait on up to MAXIMUM_WAIT_OBJECTS (64) of the last handles, and then + * selectively wait via GetOverlappedResult() if any earlier items have + * not completed yet. Overall this results in fewer system calls, + * i.e. less overhead. However, it is not certain that this saving + * will not be outweighed by an inefficient implementation of + * WaitForMultipleObjectsEx(), but then that is a problem on M$'s side. */ + DWORD madness; + do + madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS) + ?
(DWORD)pending_count + : MAXIMUM_WAIT_OBJECTS, + wait_for, true, + /* 24 hours */ 86400000ul, true); + while (madness == WAIT_IO_COMPLETION); + STATIC_ASSERT(WAIT_OBJECT_0 == 0); + if (/* madness >= WAIT_OBJECT_0 && */ + madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS) + r.err = MDBX_SUCCESS; + else if (madness >= WAIT_ABANDONED_0 && + madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) { + r.err = ERROR_ABANDONED_WAIT_0; + goto bailout_rc; + } else if (madness == WAIT_TIMEOUT) { + r.err = WAIT_TIMEOUT; + goto bailout_rc; + } else { + r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM; + goto bailout_rc; + } + + assert(ior->async_waiting == ior->async_completed); + for (ior_item_t *item = ior->pool; item <= ior->last;) { + size_t i = 1, bytes = item->single.iov_len - 1; + if (bytes & 1) { + bytes = ior->pagesize; + while (item->sgv[i].Buffer) { + bytes += ior->pagesize; + ++i; + } + if (!HasOverlappedIoCompleted(&item->ov)) { + DWORD written = 0; + if (unlikely( + !GetOverlappedResult(ior->fd, &item->ov, &written, true))) { + ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "GetOverlappedResult", item, item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + GetLastError()); + goto bailout_geterr; + } + assert(MDBX_SUCCESS == item->ov.Internal); + assert(written == item->ov.InternalHigh); + } + } else { + assert(HasOverlappedIoCompleted(&item->ov)); + } + assert(item->ov.Internal != ERROR_IO_PENDING); + if (unlikely(item->ov.Internal != MDBX_SUCCESS)) { + DWORD written = 0; + r.err = (int)item->ov.Internal; + if ((r.err & 0x80000000) && + GetOverlappedResult(NULL, &item->ov, &written, true)) + r.err = (int)GetLastError(); + ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "Result", item, item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), +
GetLastError()); + goto bailout_rc; + } + if (unlikely(item->ov.InternalHigh != bytes)) { + r.err = ERROR_WRITE_FAULT; + goto bailout_rc; + } + item = ior_next(item, i); + } + assert(ior->async_waiting == ior->async_completed); + } else { + assert(r.err == MDBX_SUCCESS); + } + assert(ior->async_waiting == ior->async_completed); + +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + for (ior_item_t *item = ior->pool; item <= ior->last;) { +#if MDBX_HAVE_PWRITEV + assert(item->sgvcnt > 0); + if (item->sgvcnt == 1) + r.err = osal_pwrite(ior->fd, item->sgv[0].iov_base, item->sgv[0].iov_len, + item->offset); + else + r.err = osal_pwritev(ior->fd, item->sgv, item->sgvcnt, item->offset); + + // TODO: io_uring_prep_write(sqe, fd, ...); + + item = ior_next(item, item->sgvcnt); +#else + r.err = osal_pwrite(ior->fd, item->single.iov_base, item->single.iov_len, + item->offset); + item = ior_next(item, 1); +#endif + r.wops += 1; + if (unlikely(r.err != MDBX_SUCCESS)) + break; + } + + // TODO: io_uring_submit(&ring) + // TODO: err = io_uring_wait_cqe(&ring, &cqe); + // TODO: io_uring_cqe_seen(&ring, cqe); + +#endif /* !Windows */ + return r; +} + +MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { +#if defined(_WIN32) || defined(_WIN64) + if (ior->last) { + for (ior_item_t *item = ior->pool; item <= ior->last;) { + if (!HasOverlappedIoCompleted(&item->ov)) + CancelIoEx(ior->fd, &item->ov); + if (item->ov.hEvent && item->ov.hEvent != ior) + ior_put_event(ior, item->ov.hEvent); + size_t i = 1; + if ((item->single.iov_len & 1) == 0) + while (item->sgv[i].Buffer) + ++i; + item = ior_next(item, i); + } + } + ior->async_waiting = INT_MAX; + ior->async_completed = 0; + ResetEvent(ior->async_done); +#endif /* !Windows */ + ior->slots_left = ior->allocated; + ior->last = nullptr; +} + +static void ior_cleanup(osal_ioring_t *ior, const size_t since) { + osal_ioring_reset(ior); +#if defined(_WIN32) || 
defined(_WIN64) + for (size_t i = since; i < ior->event_stack; ++i) + CloseHandle(ior->event_pool[i]); + ior->event_stack = 0; +#else + (void)since; +#endif /* Windows */ +} + +MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { + assert(items > 0 && items < INT_MAX / sizeof(ior_item_t)); +#if defined(_WIN32) || defined(_WIN64) + if (ior->state & IOR_STATE_LOCKED) + return MDBX_SUCCESS; + const bool useSetFileIoOverlappedRange = (ior->flags & IOR_OVERLAPPED) && + mdbx_SetFileIoOverlappedRange && + items > 7; + const size_t ceiling = + useSetFileIoOverlappedRange + ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4) + : 4096; + const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling); + items = bytes / sizeof(ior_item_t); +#endif /* Windows */ + + if (items != ior->allocated) { + assert(items >= osal_ioring_used(ior)); + if (items < ior->allocated) + ior_cleanup(ior, items); +#if defined(_WIN32) || defined(_WIN64) + void *ptr = osal_realloc( + ior->event_pool, + (items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE)); + if (unlikely(!ptr)) + return MDBX_ENOMEM; + ior->event_pool = ptr; + + int err = osal_memalign_alloc(ceiling, bytes, &ptr); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (ior->pool) { + memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t)); + osal_memalign_free(ior->pool); + } +#else + void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items); + if (unlikely(!ptr)) + return MDBX_ENOMEM; +#endif + ior->pool = ptr; + + if (items > ior->allocated) + memset(ior->pool + ior->allocated, 0, + sizeof(ior_item_t) * (items - ior->allocated)); + ior->allocated = (unsigned)items; + ior->boundary = (char *)(ior->pool + ior->allocated); +#if defined(_WIN32) || defined(_WIN64) + if (useSetFileIoOverlappedRange) { + if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes)) + ior->state += IOR_STATE_LOCKED; + else + return GetLastError(); + } +#endif /* Windows */ + } + 
return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) { + if (ior->allocated) + ior_cleanup(ior, 0); +#if defined(_WIN32) || defined(_WIN64) + osal_memalign_free(ior->pool); + osal_free(ior->event_pool); + CloseHandle(ior->async_done); +#else + osal_free(ior->pool); +#endif + memset(ior, -1, sizeof(osal_ioring_t)); +} + +/*----------------------------------------------------------------------------*/ + MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) { #if defined(_WIN32) || defined(_WIN64) return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); @@ -589,17 +1179,21 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, case MDBX_OPEN_DXB_LAZY: DesiredAccess |= GENERIC_READ | GENERIC_WRITE; break; + case MDBX_OPEN_DXB_OVERLAPPED: + FlagsAndAttributes |= FILE_FLAG_OVERLAPPED; + /* fall through */ + __fallthrough; case MDBX_OPEN_DXB_DSYNC: CreationDisposition = OPEN_EXISTING; - DesiredAccess |= GENERIC_WRITE; + DesiredAccess |= GENERIC_WRITE | GENERIC_READ; FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; break; case MDBX_OPEN_COPY: CreationDisposition = CREATE_NEW; ShareMode = 0; DesiredAccess |= GENERIC_WRITE; - FlagsAndAttributes |= - (env->me_psize < env->me_os_psize) ? 
0 : FILE_FLAG_NO_BUFFERING; + if (env->me_psize >= env->me_os_psize) + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; break; case MDBX_OPEN_DELETE: CreationDisposition = OPEN_EXISTING; @@ -878,28 +1472,30 @@ MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, } } -int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, - uint64_t offset, size_t expected_written) { -#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \ - (defined(__ANDROID_API__) && __ANDROID_API__ < 24) +int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int sgvcnt, + uint64_t offset) { + size_t expected = 0; + for (int i = 0; i < sgvcnt; ++i) + expected += iov[i].iov_len; +#if !MDBX_HAVE_PWRITEV size_t written = 0; - for (int i = 0; i < iovcnt; ++i) { + for (int i = 0; i < sgvcnt; ++i) { int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); if (unlikely(rc != MDBX_SUCCESS)) return rc; written += iov[i].iov_len; offset += iov[i].iov_len; } - return (expected_written == written) ? MDBX_SUCCESS - : MDBX_EIO /* ERROR_WRITE_FAULT */; + return (expected == written) ? 
MDBX_SUCCESS + : MDBX_EIO /* ERROR_WRITE_FAULT */; #else int rc; intptr_t written; do { STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); - written = pwritev(fd, iov, iovcnt, offset); - if (likely(expected_written == (size_t)written)) + written = pwritev(fd, iov, sgvcnt, offset); + if (likely(expected == (size_t)written)) return MDBX_SUCCESS; rc = errno; } while (rc == EINTR); @@ -1066,7 +1662,7 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, size_t length, enum osal_syncmode_bits mode_bits) { uint8_t *ptr = (uint8_t *)map->address + offset; diff --git a/src/osal.h b/src/osal.h index cec91dca..11ef24f8 100644 --- a/src/osal.h +++ b/src/osal.h @@ -263,8 +263,138 @@ typedef union osal_srwlock { } osal_srwlock_t; #endif /* Windows */ +#ifndef MDBX_HAVE_PWRITEV +#if defined(_WIN32) || defined(_WIN64) + +#define MDBX_HAVE_PWRITEV 0 + +#elif defined(__ANDROID_API__) + +#if __ANDROID_API__ < 24 +#define MDBX_HAVE_PWRITEV 0 +#else +#define MDBX_HAVE_PWRITEV 1 +#endif + +#elif defined(__APPLE__) || defined(__MACH__) || defined(_DARWIN_C_SOURCE) + +#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \ + MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0 +/* FIXME: add checks for IOS versions, etc */ +#define MDBX_HAVE_PWRITEV 1 +#else +#define MDBX_HAVE_PWRITEV 0 +#endif + +#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1) +#define MDBX_HAVE_PWRITEV 1 +#else +#define MDBX_HAVE_PWRITEV 0 +#endif +#endif /* MDBX_HAVE_PWRITEV */ + +typedef struct ior_item { +#if defined(_WIN32) || defined(_WIN64) + OVERLAPPED ov; +#define ior_svg_gap4terminator 1 +#define ior_sgv_element FILE_SEGMENT_ELEMENT +#else + size_t offset; +#if MDBX_HAVE_PWRITEV + 
size_t sgvcnt; +#define ior_svg_gap4terminator 0 +#define ior_sgv_element struct iovec +#endif /* MDBX_HAVE_PWRITEV */ +#endif /* !Windows */ + union { + MDBX_val single; +#if defined(ior_sgv_element) + ior_sgv_element sgv[1 + ior_svg_gap4terminator]; +#endif /* ior_sgv_element */ + }; +} ior_item_t; + +typedef struct osal_ioring { + unsigned slots_left; + unsigned allocated; +#if defined(_WIN32) || defined(_WIN64) +#define IOR_UNBUFFERED 1 +#define IOR_OVERLAPPED 2 +#define IOR_STATE_LOCKED 1 + unsigned pagesize; + unsigned last_sgvcnt; + size_t last_bytes; + uint8_t flags, state, pagesize_ln2; + unsigned event_stack; + HANDLE *event_pool; + volatile LONG async_waiting; + volatile LONG async_completed; + HANDLE async_done; + +#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt +#define ior_last_bytes(ior, item) (ior)->last_bytes +#elif MDBX_HAVE_PWRITEV + unsigned last_bytes; +#define ior_last_sgvcnt(ior, item) (item)->sgvcnt +#define ior_last_bytes(ior, item) (ior)->last_bytes +#else +#define ior_last_sgvcnt(ior, item) (1) +#define ior_last_bytes(ior, item) (item)->single.iov_len +#endif /* !Windows */ + mdbx_filehandle_t fd; + ior_item_t *last; + ior_item_t *pool; + char *boundary; +} osal_ioring_t; + #ifndef __cplusplus +/* Actually this is not ioring for now, but on the way. 
*/ +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +#if defined(_WIN32) || defined(_WIN64) + unsigned flags, +#endif /* Windows */ + mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); +MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); +MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); +MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset, + void *data, const size_t bytes); +typedef struct osal_ioring_write_result { + int err; + unsigned wops; +} osal_ioring_write_result_t; +MDBX_INTERNAL_FUNC osal_ioring_write_result_t +osal_ioring_write(osal_ioring_t *ior); + +typedef struct iov_ctx iov_ctx_t; +MDBX_INTERNAL_FUNC void osal_ioring_walk( + osal_ioring_t *ior, iov_ctx_t *ctx, + void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)); + +static inline unsigned osal_ioring_left(const osal_ioring_t *ior) { + return ior->slots_left; +} + +static inline unsigned osal_ioring_used(const osal_ioring_t *ior) { + return ior->allocated - ior->slots_left; +} + +static inline int osal_ioring_reserve(osal_ioring_t *ior, unsigned items, + size_t bytes) { + items = (items > 32) ? items : 32; +#if defined(_WIN32) || defined(_WIN64) + const unsigned npages = (unsigned)(bytes >> ior->pagesize_ln2); + items = (items > npages) ? items : npages; +#else + (void)bytes; +#endif + items = (items < 65536) ? 
items : 65536; + if (likely(ior->allocated >= items)) + return MDBX_SUCCESS; + return osal_ioring_resize(ior, items); +} + /*----------------------------------------------------------------------------*/ /* libc compatibility stuff */ @@ -290,10 +420,12 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ -#if defined(_WIN32) || defined(_WIN64) -#define MAX_WRITE UINT32_C(0x01000000) +#if defined(_WIN64) +#define MAX_WRITE UINT32_C(0x10000000) +#elif defined(_WIN32) +#define MAX_WRITE UINT32_C(0x04000000) #else -#define MAX_WRITE UINT32_C(0x3fff0000) +#define MAX_WRITE UINT32_C(0x3f000000) #endif #if defined(__linux__) || defined(__gnu_linux__) @@ -336,8 +468,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, - int iovcnt, uint64_t offset, - size_t expected_written); + int sgvcnt, uint64_t offset); MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, @@ -365,12 +496,15 @@ MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); enum osal_openfile_purpose { - MDBX_OPEN_DXB_READ = 0, - MDBX_OPEN_DXB_LAZY = 1, - MDBX_OPEN_DXB_DSYNC = 2, - MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4, - MDBX_OPEN_DELETE = 5 + MDBX_OPEN_DXB_READ, + MDBX_OPEN_DXB_LAZY, + MDBX_OPEN_DXB_DSYNC, +#if defined(_WIN32) || defined(_WIN64) + MDBX_OPEN_DXB_OVERLAPPED, +#endif /* Windows */ + MDBX_OPEN_LCK, + MDBX_OPEN_COPY, + MDBX_OPEN_DELETE }; MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, @@ -404,7 +538,7 @@ osal_suspend_threads_before_remap(MDBX_env *env, 
mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, size_t length, enum osal_syncmode_bits mode_bits); MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, @@ -692,6 +826,11 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); +typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, + PUCHAR OverlappedRangeStart, + ULONG Length); +MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; + #endif /* Windows */ #endif /* !__cplusplus */ diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 29ac5cb1..c90e4c05 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -71,7 +71,7 @@ void osal_setup(const std::vector &actors) { events.reserve(n); for (unsigned i = 0; i < n; ++i) { - HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + HANDLE hEvent = CreateEventW(NULL, TRUE, FALSE, NULL); if (!hEvent) failure_perror("CreateEvent()", GetLastError()); hEvent = make_inheritable(hEvent); @@ -79,22 +79,22 @@ void osal_setup(const std::vector &actors) { events[i] = hEvent; } - hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL); + hBarrierSemaphore = CreateSemaphoreW(NULL, 0, (LONG)actors.size(), NULL); if (!hBarrierSemaphore) failure_perror("CreateSemaphore(BarrierSemaphore)", GetLastError()); hBarrierSemaphore = make_inheritable(hBarrierSemaphore); - hBarrierEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + hBarrierEvent = CreateEventW(NULL, TRUE, FALSE, NULL); if (!hBarrierEvent) failure_perror("CreateEvent(BarrierEvent)", GetLastError()); hBarrierEvent = make_inheritable(hBarrierEvent); - hProgressActiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + hProgressActiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL); if 
(!hProgressActiveEvent) failure_perror("CreateEvent(ProgressActiveEvent)", GetLastError()); hProgressActiveEvent = make_inheritable(hProgressActiveEvent); - hProgressPassiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + hProgressPassiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL); if (!hProgressPassiveEvent) failure_perror("CreateEvent(ProgressPassiveEvent)", GetLastError()); hProgressPassiveEvent = make_inheritable(hProgressPassiveEvent); From 8aeb22b8bfe4954bbdd33fd7175b7588e4dd978d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 26 Sep 2022 20:05:02 +0300 Subject: [PATCH 120/364] =?UTF-8?q?mdbx:=20=D0=BB=D0=BE=D0=B3=D0=B8=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BE=D1=88=D0=B8=D0=B1?= =?UTF-8?q?=D0=BE=D0=BA=20=D0=BF=D1=80=D0=B8=20=D0=BF=D0=BE=D0=B4=D0=B3?= =?UTF-8?q?=D0=BE=D1=82=D0=BE=D0=B2=D0=BA=D0=B5/=D0=B7=D0=B0=D0=BF=D0=B8?= =?UTF-8?q?=D1=81=D0=B8/=D1=84=D0=B8=D0=BA=D1=81=D0=B0=D1=86=D0=B8=D0=B8?= =?UTF-8?q?=20=D1=82=D1=80=D0=B0=D0=BD=D0=B7=D0=B0=D0=BA=D1=86=D0=B8=D0=B9?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index d307b60b..dd2f2ffe 100644 --- a/src/core.c +++ b/src/core.c @@ -10616,20 +10616,26 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { iov_ctx_t write_ctx; rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count); - if (unlikely(rc != MDBX_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "iov-init", rc); goto fail; + } if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (uint32_t)head.txnid) { /* sync prev meta */ rc = meta_sync(env, head); - if (unlikely(rc != MDBX_SUCCESS)) + if 
(unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "presync-meta", rc); goto fail; + } } rc = txn_write(txn, &write_ctx); - if (unlikely(rc != MDBX_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "write", rc); goto fail; + } /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ ts_3 = latency ? osal_monotime() : 0; @@ -10659,9 +10665,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta, &txn->tw.troika); + ts_4 = latency ? osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; + ERROR("txn-%s: error %d", "sync", rc); goto fail; } From 2236b905677023bbb817f160ead53993d25f0bac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 27 Sep 2022 02:37:28 +0300 Subject: [PATCH 121/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D0=BF=D1=86=D0=B8=D1=8F=20?= =?UTF-8?q?=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B8=20`MDBX=5FAVOID=5FMSYNC`?= =?UTF-8?q?=20(=D0=BE=D0=B1=D1=8A=D0=B5=D0=B4=D0=B8=D0=BD=D1=91=D0=BD?= =?UTF-8?q?=D0=BD=D1=8B=D0=B5=20=D0=BA=D0=BE=D0=BC=D0=BC=D0=B8=D1=82=D1=8B?= =?UTF-8?q?=20=D0=B8=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D1=8F).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 224 +++++++++++++++++++++++++++++--------------------- src/options.h | 29 ++++--- src/osal.c | 6 +- src/osal.h | 3 +- 4 files changed, 151 insertions(+), 111 deletions(-) diff --git a/src/core.c b/src/core.c index dd2f2ffe..aed1f500 100644 --- a/src/core.c +++ b/src/core.c @@ -3972,7 +3972,7 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, const size_t bytes = pgno2bytes(env, npages); memset(mp, -1, bytes); mp->mp_pgno = pgno; - 
if ((env->me_flags & MDBX_WRITEMAP) == 0) + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); } else { struct iovec iov[MDBX_COMMIT_PAGES]; @@ -4430,6 +4430,9 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); dp->mp_txnid = txn->mt_txnid; tASSERT(txn, IS_SPILLED(txn, dp)); +#if MDBX_AVOID_MSYNC + doit:; +#endif /* MDBX_AVOID_MSYNC */ int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, pgno2bytes(env, npages)); if (unlikely(err != MDBX_SUCCESS)) { @@ -4452,6 +4455,9 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, } } else { tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); +#if MDBX_AVOID_MSYNC + goto doit; +#endif /* MDBX_AVOID_MSYNC */ } #if MDBX_NEED_WRITTEN_RANGE @@ -4466,17 +4472,18 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, } static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, - unsigned npages) { + const unsigned npages) { +#if !MDBX_AVOID_MSYNC tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - pgno_t pgno = dp->mp_pgno; - int err = iov_page(txn, ctx, dp, npages); - if (likely(err == MDBX_SUCCESS)) { - err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); +#endif /* MDBX_AVOID_MSYNC */ #if MDBX_ENABLE_PGOP_STAT - if (likely(err == MDBX_SUCCESS)) - txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; + txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - } + const pgno_t pgno = dp->mp_pgno; + int err = iov_page(txn, ctx, dp, npages); + if (likely(err == MDBX_SUCCESS) && + (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP))) + err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); return err; } @@ -4610,6 +4617,29 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, (need > txn->tw.dirtyroom) ? 
need - txn->tw.dirtyroom : 1; #endif /* xMDBX_DEBUG_SPILLING */ + int rc = MDBX_SUCCESS; +#if !MDBX_AVOID_MSYNC + if (txn->mt_flags & MDBX_WRITEMAP) { + NOTICE("%s-spilling of %u dirty-entries (have %u dirty-room, need %u)", + "msync", wanna_spill, txn->tw.dirtyroom, need); + tASSERT(txn, txn->tw.spill_pages == nullptr); + const MDBX_env *env = txn->mt_env; + rc = + osal_msync(&txn->mt_env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + dpl_clear(txn->tw.dirtylist); + txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; + for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = lp->mp_next) { + rc = dpl_append(txn, lp->mp_pgno, lp, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + goto done; + } +#endif /* MDBX_AVOID_MSYNC */ + const unsigned dirty = txn->tw.dirtylist->length; const unsigned spill_min = txn->mt_env->me_options.spill_min_denominator @@ -4624,68 +4654,27 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (!wanna_spill) return MDBX_SUCCESS; - NOTICE("spilling %u dirty-entries (have %u dirty-room, need %u)", wanna_spill, - txn->tw.dirtyroom, need); + NOTICE("%s-spilling %u dirty-entries (have %u dirty-room, need %u)", "pwrite", + wanna_spill, txn->tw.dirtyroom, need); tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); - - int rc = MDBX_SUCCESS; - if (txn->mt_flags & MDBX_WRITEMAP) { - MDBX_dpl *const dl = txn->tw.dirtylist; - const unsigned span = dl->length - txn->tw.loose_count; - txn->tw.dirtyroom += span; - - iov_ctx_t ctx; - rc = iov_init(txn, &ctx, wanna_spill, - dl->pages_including_loose - txn->tw.loose_count); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - unsigned r, w; - for (w = 0, r = 1; r <= dl->length; ++r) { - MDBX_page *dp = dl->items[r].ptr; - if (dp->mp_flags & P_LOOSE) - dl->items[++w] = dl->items[r]; - else if (!MDBX_FAKE_SPILL_WRITEMAP) { - rc = iov_page(txn, &ctx, dp, 
dpl_npages(dl, r)); - tASSERT(txn, rc == MDBX_SUCCESS); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + if (!txn->tw.spill_pages) { + txn->tw.spill_least_removed = INT_MAX; + txn->tw.spill_pages = pnl_alloc(wanna_spill); + if (unlikely(!txn->tw.spill_pages)) { + rc = MDBX_ENOMEM; + bailout: + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; } + } else { + /* purge deleted slots */ + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill); + (void)rc /* ignore since the resulting list may be shorter + and pnl_append() will increase pnl on demand */ + ; } - - tASSERT(txn, span == r - 1 - w && w == txn->tw.loose_count); - dl->sorted = (dl->sorted == dl->length) ? w : 0; - dpl_setlen(dl, w); - tASSERT(txn, dirtylist_check(txn)); - - if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) { - MDBX_env *const env = txn->mt_env; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, - pgno_align2os_bytes(env, ctx.flush_begin), - pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin), - MDBX_SYNC_NONE); - } - return rc; - } - - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - if (!txn->tw.spill_pages) { - txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = pnl_alloc(wanna_spill); - if (unlikely(!txn->tw.spill_pages)) { - rc = MDBX_ENOMEM; - bailout: - txn->mt_flags |= MDBX_TXN_ERROR; - return rc; - } - } else { - /* purge deleted slots */ - spill_purge(txn); - rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill); - (void)rc /* ignore since the resulting list may be shorter - and pnl_append() will increase pnl on demand */ - ; } /* Сортируем чтобы запись на диск была полее последовательна */ @@ -4848,8 +4837,10 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); - txn->mt_flags |= MDBX_TXN_SPILLS; + if 
(!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); + txn->mt_flags |= MDBX_TXN_SPILLS; + } NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries, txn->tw.dirtyroom); } else { @@ -5783,9 +5774,13 @@ static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, if (env->me_flags & MDBX_WRITEMAP) { unaligned_poke_u64(4, meta->mm_sign, wipe); osal_flush_incoherent_cpu_writeback(); - err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA); - if (unlikely(err != MDBX_SUCCESS)) + if (!MDBX_AVOID_MSYNC) { + err = + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ return err; } ptr = data_page(meta); @@ -7082,18 +7077,26 @@ static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { int rc = MDBX_RESULT_TRUE; if (env->me_flags & MDBX_WRITEMAP) { -#if MDBX_ENABLE_PGOP_ST - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const MDBX_page *page = data_page(head.ptr_c); - rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - (uint8_t *)page - env->me_map); - - if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (!MDBX_AVOID_MSYNC) { + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ + } else { +#if MDBX_ENABLE_PGOP_ST + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(head.ptr_c); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + (uint8_t *)page - 
env->me_map); + + if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } } } else { rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); @@ -9948,8 +9951,9 @@ bailout: } static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { - MDBX_dpl *const dl = - (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn); + MDBX_dpl *dl = txn->tw.dirtylist; + if (MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) + dl = dpl_sort(txn); int rc = MDBX_SUCCESS; unsigned r, w; for (w = 0, r = 1; r <= dl->length; ++r) { @@ -11273,15 +11277,19 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; } + if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.msync.weak += sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (flags & MDBX_WRITEMAP) rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); - else + } else { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += sync_op; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, mode_bits); + } if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = (flags & MDBX_SAFE_NOSYNC) ? 
MDBX_RESULT_TRUE /* carry non-steady */ @@ -11399,14 +11407,33 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, memcpy(target->mm_sign, pending->mm_sign, 8); osal_flush_incoherent_cpu_writeback(); jitter4testing(true); - /* sync meta-pages */ + if (!MDBX_AVOID_MSYNC) { + /* sync meta-pages */ #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) - ? MDBX_SYNC_NONE - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) + ? MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(target); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + (uint8_t *)page - env->me_map); + if (likely(rc == MDBX_SUCCESS)) { + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); + if ((flags & MDBX_NOMETASYNC) == 0 && + env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + } + } if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { @@ -13347,8 +13374,16 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, uint8_t ior_flags = 0; if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) { ior_flags = IOR_OVERLAPPED; + if ((flags & MDBX_WRITEMAP) && MDBX_AVOID_MSYNC) { + MDBX_meta header; + if (read_header(env, &header, MDBX_SUCCESS, true) == MDBX_SUCCESS && + header.mm_psize >= env->me_os_psize) + ior_flags |= IOR_DIRECT; + } + rc = - osal_openfile(MDBX_OPEN_DXB_OVERLAPPED, + osal_openfile((ior_flags & 
IOR_DIRECT) ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT + : MDBX_OPEN_DXB_OVERLAPPED, env, env_pathname.dxb, &env->me_overlapped_fd, 0); if (rc != MDBX_SUCCESS) goto bailout; @@ -23481,6 +23516,7 @@ __dll_export " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG + " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) #if MDBX_DISABLE_VALIDATION diff --git a/src/options.h b/src/options.h index 08018630..1a28e619 100644 --- a/src/options.h +++ b/src/options.h @@ -121,23 +121,22 @@ #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Basically, this build-option is for TODO. Guess it should be replaced - * with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants: - * 0/OFF = Don't track dirty pages at all and don't spilling ones. - * This should be by-default on Linux and may-be other systems - * (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides - * properly LRU tracking and async writing on-demand. - * 1/ON = Lite tracking of dirty pages but with LRU labels and explicit - * spilling with msync(MS_ASYNC). */ -#ifndef MDBX_FAKE_SPILL_WRITEMAP -#if defined(__linux__) || defined(__gnu_linux__) -#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */ +/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP + * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use + * msync() to persist data. This is by-default on Linux and other systems where + * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON + * = Tracking of dirty pages but with LRU labels for spilling and explicit + * persist ones by write(). This may be reasonable for systems which low + * performance of msync() and/or LRU tracking. 
*/ +#ifndef MDBX_AVOID_MSYNC +#if defined(_WIN32) || defined(_WIN64) +#define MDBX_AVOID_MSYNC 1 #else -#define MDBX_FAKE_SPILL_WRITEMAP 0 +#define MDBX_AVOID_MSYNC 0 #endif -#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1) -#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1 -#endif /* MDBX_FAKE_SPILL_WRITEMAP */ +#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1) +#error MDBX_AVOID_MSYNC must be defined as 0 or 1 +#endif /* MDBX_AVOID_MSYNC */ /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. diff --git a/src/osal.c b/src/osal.c index 77b6adfc..2e0bb56a 100644 --- a/src/osal.c +++ b/src/osal.c @@ -637,7 +637,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, #if defined(_WIN32) || defined(_WIN64) const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2); const bool use_gather = - (ior->flags & IOR_UNBUFFERED) && ior->slots_left >= segments; + (ior->flags & IOR_DIRECT) && ior->slots_left >= segments; #endif /* Windows */ ior_item_t *item = ior->pool; @@ -1179,6 +1179,10 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, case MDBX_OPEN_DXB_LAZY: DesiredAccess |= GENERIC_READ | GENERIC_WRITE; break; + case MDBX_OPEN_DXB_OVERLAPPED_DIRECT: + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; + /* fall through */ + __fallthrough; case MDBX_OPEN_DXB_OVERLAPPED: FlagsAndAttributes |= FILE_FLAG_OVERLAPPED; /* fall through */ diff --git a/src/osal.h b/src/osal.h index 11ef24f8..568ae9c0 100644 --- a/src/osal.h +++ b/src/osal.h @@ -318,7 +318,7 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_UNBUFFERED 1 +#define IOR_DIRECT 1 #define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 unsigned pagesize; @@ -501,6 +501,7 @@ enum osal_openfile_purpose { MDBX_OPEN_DXB_DSYNC, #if defined(_WIN32) || defined(_WIN64) 
MDBX_OPEN_DXB_OVERLAPPED, + MDBX_OPEN_DXB_OVERLAPPED_DIRECT, #endif /* Windows */ MDBX_OPEN_LCK, MDBX_OPEN_COPY, From 375fa3a225d33f1ebdbd9fb15d25d7addf53e5ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 28 Sep 2022 00:06:57 +0300 Subject: [PATCH 122/364] =?UTF-8?q?mdbx:=20=D0=BD=D0=B5=D0=B1=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D1=88=D0=B0=D1=8F=20=D1=87=D0=B8=D1=81=D1=82=D0=BA=D0=B0?= =?UTF-8?q?=20`dlist=5Ffree()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index aed1f500..3b8734ac 100644 --- a/src/core.c +++ b/src/core.c @@ -3644,13 +3644,12 @@ static MDBX_page *page_malloc(MDBX_txn *txn, unsigned num) { static void dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); - if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB)) + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) memset(dp, -1, pgno2bytes(env, npages)); if (npages == 1 && env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { MDBX_ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next), - pgno2bytes(env, npages) - - sizeof(dp->mp_next)); + env->me_psize - sizeof(dp->mp_next)); dp->mp_next = env->me_dp_reserve; VALGRIND_MEMPOOL_FREE(env, dp); env->me_dp_reserve = dp; From bcd5bad74ad8f59a7fa71b31cb809bb4dca6439f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 28 Sep 2022 14:06:55 +0300 Subject: [PATCH 123/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=BE=20`MDBX=5FNORETURN`=20=D0=BA=20`mdbx?= 
=?UTF-8?q?=5Fpanic()`=20=D0=B8=20`mdbx=5Fassert=5Ffail()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 9 ++++++--- src/osal.c | 26 +++++++++++++++----------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/mdbx.h b/mdbx.h index b10ab212..7ab171a1 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1020,12 +1020,15 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, const size_t bufsize); /** \brief Panics with message and causes abnormal process termination. */ -LIBMDBX_API void mdbx_panic(const char *fmt, ...) MDBX_PRINTF_ARGS(1, 2); +MDBX_NORETURN LIBMDBX_API void mdbx_panic(const char *fmt, ...) + MDBX_PRINTF_ARGS(1, 2); /** \brief Panics with asserton failed message and causes abnormal process * termination. */ -LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, - const char *func, unsigned line); +MDBX_NORETURN LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, + const char *msg, + const char *func, + unsigned line); /** end of c_debug @} */ /** \brief Environment flags diff --git a/src/osal.c b/src/osal.c index 2e0bb56a..34aeb62d 100644 --- a/src/osal.c +++ b/src/osal.c @@ -247,13 +247,15 @@ __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, #endif } + while (1) { #if defined(_WIN32) || defined(_WIN64) - if (IsDebuggerPresent()) - DebugBreak(); - FatalExit(ERROR_UNHANDLED_ERROR); + if (IsDebuggerPresent()) + DebugBreak(); + FatalExit(ERROR_UNHANDLED_ERROR); #else - abort(); + abort(); #endif + } } __cold void mdbx_panic(const char *fmt, ...) { @@ -267,16 +269,18 @@ __cold void mdbx_panic(const char *fmt, ...) { (num < 1 || !message) ? 
"" : message; + while (1) { #if defined(_WIN32) || defined(_WIN64) - OutputDebugStringA("\r\nMDBX-PANIC: "); - OutputDebugStringA(const_message); - if (IsDebuggerPresent()) - DebugBreak(); - FatalExit(ERROR_UNHANDLED_ERROR); + OutputDebugStringA("\r\nMDBX-PANIC: "); + OutputDebugStringA(const_message); + if (IsDebuggerPresent()) + DebugBreak(); + FatalExit(ERROR_UNHANDLED_ERROR); #else - __assert_fail(const_message, "mdbx", 0, "panic"); - abort(); + __assert_fail(const_message, "mdbx", 0, "panic"); + abort(); #endif + } } /*----------------------------------------------------------------------------*/ From 143e3dfb775b88cd0eed137925d7b106ad444b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 29 Sep 2022 16:18:10 +0300 Subject: [PATCH 124/364] =?UTF-8?q?mdbx:=20=D0=BF=D1=80=D0=B5=D0=B8=D0=BC?= =?UTF-8?q?=D1=83=D1=89=D0=B5=D1=81=D1=82=D0=B2=D0=B5=D0=BD=D0=BD=D0=BE?= =?UTF-8?q?=D0=B5=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB=D1=8C=D0=B7=D0=BE=D0=B2?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D0=B5=20`size=5Ft`=20=D0=B4=D0=BB=D1=8F=20?= =?UTF-8?q?=D1=83=D0=BC=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=BD=D0=B0=D0=BA=D0=BB=D0=B0=D0=B4=D0=BD=D1=8B=D1=85=20=D1=80?= =?UTF-8?q?=D0=B0=D1=81=D1=85=D0=BE=D0=B4=D0=BE=D0=B2=20=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=BF=D0=BB=D0=B0=D1=82=D1=84=D0=BE=D1=80=D0=BC=D0=B5=20=D0=AD?= =?UTF-8?q?=D0=BB=D1=8C=D0=B1=D1=80=D1=83=D1=81.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 1508 +++++++++++++++++++++++------------------------ src/internals.h | 38 +- src/osal.c | 8 +- src/osal.h | 8 +- 4 files changed, 782 insertions(+), 780 deletions(-) diff --git a/src/core.c b/src/core.c index 3b8734ac..2dab0edb 100644 --- a/src/core.c +++ b/src/core.c @@ -40,22 +40,20 @@ /*------------------------------------------------------------------------------ * Internal inline functions 
*/ -MDBX_NOTHROW_CONST_FUNCTION static unsigned branchless_abs(int value) { +MDBX_NOTHROW_CONST_FUNCTION static size_t branchless_abs(intptr_t value) { assert(value > INT_MIN); - const unsigned expanded_sign = - (unsigned)(value >> (sizeof(value) * CHAR_BIT - 1)); - return ((unsigned)value + expanded_sign) ^ expanded_sign; + const size_t expanded_sign = + (size_t)(value >> (sizeof(value) * CHAR_BIT - 1)); + return ((size_t)value + expanded_sign) ^ expanded_sign; } /* Pack/Unpack 16-bit values for Grow step & Shrink threshold */ -MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m, - unsigned e) { +MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(size_t m, size_t e) { assert(m < 2048 && e < 8); return (pgno_t)(32768 + ((m + 1) << (e + 8))); } -MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v, - unsigned e) { +MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v, size_t e) { assert(v > (e ? me2v(2047, e - 1) : 32768)); assert(v <= me2v(2047, e)); size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8); @@ -103,9 +101,9 @@ MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) { /*------------------------------------------------------------------------------ * Unaligned access */ -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline unsigned -field_alignment(unsigned alignment_baseline, size_t field_offset) { - unsigned merge = alignment_baseline | (unsigned)field_offset; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t +field_alignment(size_t alignment_baseline, size_t field_offset) { + size_t merge = alignment_baseline | (size_t)field_offset; return merge & -(int)merge; } @@ -122,7 +120,7 @@ static __always_inline void poke_u8(uint8_t *const __restrict ptr, } MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint16_t -unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) { +unaligned_peek_u16(const size_t expected_alignment, 
const void *const ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0) return *(const uint16_t *)ptr; @@ -138,9 +136,9 @@ unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) { } } -static __always_inline void -unaligned_poke_u16(const unsigned expected_alignment, - void *const __restrict ptr, const uint16_t v) { +static __always_inline void unaligned_poke_u16(const size_t expected_alignment, + void *const __restrict ptr, + const uint16_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0) *(uint16_t *)ptr = v; @@ -155,7 +153,7 @@ unaligned_poke_u16(const unsigned expected_alignment, } MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32( - const unsigned expected_alignment, const void *const __restrict ptr) { + const size_t expected_alignment, const void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0) return *(const uint32_t *)ptr; @@ -177,9 +175,9 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32( } } -static __always_inline void -unaligned_poke_u32(const unsigned expected_alignment, - void *const __restrict ptr, const uint32_t v) { +static __always_inline void unaligned_poke_u32(const size_t expected_alignment, + void *const __restrict ptr, + const uint32_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0) *(uint32_t *)ptr = v; @@ -198,7 +196,7 @@ unaligned_poke_u32(const unsigned expected_alignment, } MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( - const unsigned expected_alignment, const void *const __restrict ptr) { + const size_t expected_alignment, const void *const __restrict ptr) { assert((uintptr_t)ptr % 
expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) return *(const uint64_t *)ptr; @@ -221,7 +219,7 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( } static __always_inline uint64_t -unaligned_peek_u64_volatile(const unsigned expected_alignment, +unaligned_peek_u64_volatile(const size_t expected_alignment, const volatile void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); assert(expected_alignment % sizeof(uint32_t) == 0); @@ -241,9 +239,9 @@ unaligned_peek_u64_volatile(const unsigned expected_alignment, } } -static __always_inline void -unaligned_poke_u64(const unsigned expected_alignment, - void *const __restrict ptr, const uint64_t v) { +static __always_inline void unaligned_poke_u64(const size_t expected_alignment, + void *const __restrict ptr, + const uint64_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0) *(uint64_t *)ptr = v; @@ -439,7 +437,7 @@ node_largedata_pgno(const MDBX_node *const __restrict node) { (EVEN_FLOOR(PAGEROOM(pagesize) / 2) - sizeof(indx_t)) #define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1) -static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) { +static __inline size_t keysize_max(size_t pagesize, MDBX_db_flags_t flags) { assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE && is_powerof2(pagesize)); STATIC_ASSERT(BRANCH_NODE_MAX(MIN_PAGESIZE) - NODESIZE >= 8); @@ -454,11 +452,10 @@ static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) { (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) { const intptr_t max_dupsort_leaf_key = LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(MDBX_db); - return (max_branch_key < max_dupsort_leaf_key) - ? (unsigned)max_branch_key - : (unsigned)max_dupsort_leaf_key; + return (max_branch_key < max_dupsort_leaf_key) ? 
max_branch_key + : max_dupsort_leaf_key; } - return (unsigned)max_branch_key; + return max_branch_key; } static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) { @@ -601,13 +598,13 @@ flags_db2sub(uint16_t db_flags) { /*----------------------------------------------------------------------------*/ MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -pgno2bytes(const MDBX_env *env, pgno_t pgno) { +pgno2bytes(const MDBX_env *env, size_t pgno) { eASSERT(env, (1u << env->me_psize2log) == env->me_psize); return ((size_t)pgno) << env->me_psize2log; } MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page * -pgno2page(const MDBX_env *env, pgno_t pgno) { +pgno2page(const MDBX_env *env, size_t pgno) { return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); } @@ -618,12 +615,12 @@ bytes2pgno(const MDBX_env *env, size_t bytes) { } MDBX_NOTHROW_PURE_FUNCTION static size_t -pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) { +pgno_align2os_bytes(const MDBX_env *env, size_t pgno) { return ceil_powerof2(pgno2bytes(env, pgno), env->me_os_psize); } MDBX_NOTHROW_PURE_FUNCTION static pgno_t pgno_align2os_pgno(const MDBX_env *env, - pgno_t pgno) { + size_t pgno) { return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); } @@ -649,25 +646,25 @@ page_meta(MDBX_page *mp) { } /* Number of nodes on a page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_numkeys(const MDBX_page *mp) { return mp->mp_lower >> 1; } /* The amount of space remaining in the page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_room(const MDBX_page *mp) { return mp->mp_upper - mp->mp_lower; } /* Maximum free space in an empty page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_space(const MDBX_env *env) { STATIC_ASSERT(PAGEHDRSZ % 2 == 0); return 
env->me_psize - PAGEHDRSZ; } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_used(const MDBX_env *env, const MDBX_page *mp) { return page_space(env) - page_room(mp); } @@ -750,9 +747,9 @@ __cold static void MDBX_PRINTF_ARGS(2, 3) /* Address of node i in page p */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node * -page_node(const MDBX_page *mp, unsigned i) { +page_node(const MDBX_page *mp, size_t i) { assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH); - assert(page_numkeys(mp) > (unsigned)(i)); + assert(page_numkeys(mp) > i); assert(mp->mp_ptrs[i] % 2 == 0); return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); } @@ -761,7 +758,7 @@ page_node(const MDBX_page *mp, unsigned i) { * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. * There are no node headers, keys are stored contiguously. */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * -page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { +page_leaf2key(const MDBX_page *mp, size_t i, size_t keysize) { assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); assert(mp->mp_leaf2_ksize == keysize); (void)keysize; @@ -1318,7 +1315,7 @@ __cold void thread_dtor(void *rthc) { osal_thread_self(), rthc); const uint32_t self_pid = osal_getpid(); - for (unsigned i = 0; i < rthc_count; ++i) { + for (size_t i = 0; i < rthc_count; ++i) { const osal_thread_key_t key = rthc_table[i].thr_tls_key; MDBX_reader *const reader = thread_rthc_get(key); if (reader < rthc_table[i].begin || reader >= rthc_table[i].end) @@ -1333,7 +1330,7 @@ __cold void thread_dtor(void *rthc) { #endif TRACE("== thread 0x%" PRIxPTR - ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, " + ", rthc %p, [%zi], %p ... 
%p (%+i), rtch-pid %i, " "current-pid %i", osal_thread_self(), __Wpedantic_format_voidptr(reader), i, __Wpedantic_format_voidptr(rthc_table[i].begin), @@ -1444,12 +1441,12 @@ __cold void global_dtor(void) { #endif const uint32_t self_pid = osal_getpid(); - for (unsigned i = 0; i < rthc_count; ++i) { + for (size_t i = 0; i < rthc_count; ++i) { const osal_thread_key_t key = rthc_table[i].thr_tls_key; thread_key_delete(key); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { - TRACE("== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " + TRACE("== [%zi] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " "rthc-pid %i, current-pid %i", i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), __Wpedantic_format_voidptr(rthc_table[i].end), @@ -1531,10 +1528,10 @@ __cold void rthc_remove(const osal_thread_key_t key) { TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count, rthc_limit); - for (unsigned i = 0; i < rthc_count; ++i) { + for (size_t i = 0; i < rthc_count; ++i) { if (key == rthc_table[i].thr_tls_key) { const uint32_t self_pid = osal_getpid(); - TRACE("== [%i], %p ...%p, current-pid %d", i, + TRACE("== [%zi], %p ...%p, current-pid %d", i, __Wpedantic_format_voidptr(rthc_table[i].begin), __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); @@ -1982,7 +1979,7 @@ static int lcklist_detach_locked(MDBX_env *env) { \ __hot static void NAME(TYPE *const __restrict begin, \ TYPE *const __restrict end) { \ - NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *__restrict top = stack; \ + NAME##_stack stack[sizeof(size_t) * CHAR_BIT], *__restrict top = stack; \ \ TYPE *__restrict hi = end - 1; \ TYPE *__restrict lo = begin; \ @@ -2043,8 +2040,7 @@ static int lcklist_detach_locked(MDBX_env *env) { #define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \ \ - __hot static bool NAME##_radixsort(TYPE *const begin, \ - const unsigned length) { \ + __hot static bool NAME##_radixsort(TYPE 
*const begin, const size_t length) { \ TYPE *tmp; \ if (BUFFER_PREALLOCATED) { \ tmp = begin + length + END_GAP; \ @@ -2055,37 +2051,37 @@ static int lcklist_detach_locked(MDBX_env *env) { return false; \ } \ \ - unsigned key_shift = 0, key_diff_mask; \ + size_t key_shift = 0, key_diff_mask; \ do { \ struct { \ - unsigned a[256], b[256]; \ + pgno_t a[256], b[256]; \ } counters; \ memset(&counters, 0, sizeof(counters)); \ \ key_diff_mask = 0; \ - unsigned prev_key = EXTRACT_KEY(begin) >> key_shift; \ + size_t prev_key = EXTRACT_KEY(begin) >> key_shift; \ TYPE *r = begin, *end = begin + length; \ do { \ - const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ counters.a[key & 255]++; \ counters.b[(key >> 8) & 255]++; \ key_diff_mask |= prev_key ^ key; \ prev_key = key; \ } while (++r != end); \ \ - unsigned ta = 0, tb = 0; \ - for (unsigned i = 0; i < 256; ++i) { \ - const unsigned ia = counters.a[i]; \ + pgno_t ta = 0, tb = 0; \ + for (size_t i = 0; i < 256; ++i) { \ + const pgno_t ia = counters.a[i]; \ counters.a[i] = ta; \ ta += ia; \ - const unsigned ib = counters.b[i]; \ + const pgno_t ib = counters.b[i]; \ counters.b[i] = tb; \ tb += ib; \ } \ \ r = begin; \ do { \ - const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ tmp[counters.a[key & 255]++] = *r; \ } while (++r != end); \ \ @@ -2095,7 +2091,7 @@ static int lcklist_detach_locked(MDBX_env *env) { } \ end = (r = tmp) + length; \ do { \ - const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ begin[counters.b[(key >> 8) & 255]++] = *r; \ } while (++r != end); \ \ @@ -2135,7 +2131,7 @@ static int lcklist_detach_locked(MDBX_env *env) { /* clang-format off */ #define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ static __always_inline const TYPE_LIST *NAME( \ - const TYPE_LIST *it, unsigned length, const TYPE_ARG item) { \ + const TYPE_LIST *it, size_t length, 
const TYPE_ARG item) { \ const TYPE_LIST *const begin = it, *const end = begin + length; \ \ if (MDBX_HAVE_CMOV) \ @@ -2237,9 +2233,9 @@ static void pnl_shrink(MDBX_PNL *ppl) { assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2); - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); - MDBX_PNL_SIZE(*ppl) = 0; + assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && + MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); + MDBX_PNL_SETSIZE(*ppl, 0); if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL); @@ -2257,8 +2253,8 @@ static void pnl_shrink(MDBX_PNL *ppl) { /* Grow the PNL to the size growed to at least given size */ static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); + assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && + MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); if (likely(allocated >= wanna)) return MDBX_SUCCESS; @@ -2287,21 +2283,21 @@ static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { /* Make room for num additional elements in an PNL */ static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, size_t num) { - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); + assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && + MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); assert(num <= MDBX_PGL_LIMIT); - const size_t wanna = MDBX_PNL_SIZE(*ppl) + num; + const size_t wanna = MDBX_PNL_GETSIZE(*ppl) + num; return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? 
MDBX_SUCCESS : pnl_reserve(ppl, wanna); } static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { - assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); + assert(MDBX_PNL_GETSIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); if (AUDIT_ENABLED()) { - for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i) + for (size_t i = MDBX_PNL_GETSIZE(pl); i > 0; --i) assert(pgno != pl[i]); } - MDBX_PNL_SIZE(pl) += 1; + *pl += 1; MDBX_PNL_LAST(pl) = pgno; } @@ -2309,7 +2305,7 @@ static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { __always_inline static int __must_check_result pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, - unsigned n) { + size_t n) { assert(n > 0); int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) @@ -2317,15 +2313,15 @@ __always_inline static int __must_check_result pnl_append_range(bool spilled, const MDBX_PNL pnl = *ppl; #if MDBX_PNL_ASCENDING - unsigned w = MDBX_PNL_SIZE(pnl); + size_t w = MDBX_PNL_GETSIZE(pnl); do { pnl[++w] = pgno; pgno += spilled ? 2 : 1; } while (--n); - MDBX_PNL_SIZE(pnl) = w; + MDBX_PNL_SETSIZE(pnl, w); #else - unsigned w = MDBX_PNL_SIZE(pnl) + n; - MDBX_PNL_SIZE(pnl) = w; + size_t w = MDBX_PNL_GETSIZE(pnl) + n; + MDBX_PNL_SETSIZE(pnl, w); do { pnl[w--] = pgno; pgno += spilled ? 
2 : 1; @@ -2337,15 +2333,15 @@ __always_inline static int __must_check_result pnl_append_range(bool spilled, /* Append an pgno range into the sorted PNL */ __hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl, - pgno_t pgno, unsigned n) { + pgno_t pgno, size_t n) { assert(n > 0); int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; const MDBX_PNL pnl = *ppl; - unsigned r = MDBX_PNL_SIZE(pnl), w = r + n; - MDBX_PNL_SIZE(pnl) = w; + size_t r = MDBX_PNL_GETSIZE(pnl), w = r + n; + MDBX_PNL_SETSIZE(pnl, w); while (r && MDBX_PNL_DISORDERED(pnl[r], pgno)) pnl[w--] = pnl[r--]; @@ -2357,8 +2353,8 @@ __hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl, __hot static bool pnl_check(const pgno_t *pl, const size_t limit) { assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND); - if (likely(MDBX_PNL_SIZE(pl))) { - if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT)) + if (likely(MDBX_PNL_GETSIZE(pl))) { + if (unlikely(MDBX_PNL_GETSIZE(pl) > MDBX_PGL_LIMIT)) return false; if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) return false; @@ -2366,7 +2362,7 @@ __hot static bool pnl_check(const pgno_t *pl, const size_t limit) { return false; if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) && - likely(MDBX_PNL_SIZE(pl) > 1)) { + likely(MDBX_PNL_GETSIZE(pl) > 1)) { const pgno_t *scan = MDBX_PNL_BEGIN(pl); const pgno_t *const end = MDBX_PNL_END(pl); pgno_t prev = *scan++; @@ -2382,8 +2378,8 @@ __hot static bool pnl_check(const pgno_t *pl, const size_t limit) { static __always_inline bool pnl_check_allocated(const pgno_t *pl, const size_t limit) { - return pl == nullptr || - (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl) && pnl_check(pl, limit)); + return pl == nullptr || (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_GETSIZE(pl) && + pnl_check(pl, limit)); } static __always_inline void @@ -2421,31 +2417,32 @@ pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, __hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { 
assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); assert(pnl_check(src, MAX_PAGENO + 1)); - const pgno_t src_len = MDBX_PNL_SIZE(src); - const pgno_t dst_len = MDBX_PNL_SIZE(dst); + const size_t src_len = MDBX_PNL_GETSIZE(src); + const size_t dst_len = MDBX_PNL_GETSIZE(dst); if (likely(src_len > 0)) { - const pgno_t total = dst_len + src_len; + const size_t total = dst_len + src_len; assert(MDBX_PNL_ALLOCLEN(dst) >= total); dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); - MDBX_PNL_SIZE(dst) = total; + MDBX_PNL_SETSIZE(dst, total); } assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); } -static void spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { - tASSERT(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && +static void spill_remove(MDBX_txn *txn, size_t idx, pgno_t npages) { + tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spill_pages) && txn->tw.spill_least_removed > 0); txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) ? 
idx : txn->tw.spill_least_removed; txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SIZE(txn->tw.spill_pages) -= - (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); + MDBX_PNL_SETSIZE(txn->tw.spill_pages, + MDBX_PNL_GETSIZE(txn->tw.spill_pages) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spill_pages))); while (unlikely(npages > 1)) { const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1; if (MDBX_PNL_ASCENDING) { - if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) || + if (++idx > MDBX_PNL_GETSIZE(txn->tw.spill_pages) || (txn->tw.spill_pages[idx] >> 1) != pgno) return; } else { @@ -2456,8 +2453,9 @@ static void spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { : txn->tw.spill_least_removed; } txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SIZE(txn->tw.spill_pages) -= - (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); + MDBX_PNL_SETSIZE(txn->tw.spill_pages, + MDBX_PNL_GETSIZE(txn->tw.spill_pages) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spill_pages))); --npages; } } @@ -2466,17 +2464,17 @@ static MDBX_PNL spill_purge(MDBX_txn *txn) { tASSERT(txn, txn->tw.spill_least_removed > 0); const MDBX_PNL sl = txn->tw.spill_pages; if (txn->tw.spill_least_removed != INT_MAX) { - unsigned len = MDBX_PNL_SIZE(sl), r, w; + size_t len = MDBX_PNL_GETSIZE(sl), r, w; for (w = r = txn->tw.spill_least_removed; r <= len; ++r) { sl[w] = sl[r]; w += 1 - (sl[r] & 1); } for (size_t i = 1; i < w; ++i) tASSERT(txn, (sl[i] & 1) == 0); - MDBX_PNL_SIZE(sl) = w - 1; + MDBX_PNL_SETSIZE(sl, w - 1); txn->tw.spill_least_removed = INT_MAX; } else { - for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i) + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i) tASSERT(txn, (sl[i] & 1) == 0); } return sl; @@ -2493,8 +2491,8 @@ RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) __hot __noinline static void pnl_sort_nochk(MDBX_PNL pnl) { - if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || - unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl)))) + if 
(likely(MDBX_PNL_GETSIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || + unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_GETSIZE(pnl)))) pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); } @@ -2508,45 +2506,45 @@ static __inline void pnl_sort(MDBX_PNL pnl, size_t limit4check) { * Returns The index of the first item greater than or equal to pgno. */ SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) -__hot __noinline static unsigned pnl_search_nochk(const MDBX_PNL pnl, - pgno_t pgno) { +__hot __noinline static size_t pnl_search_nochk(const MDBX_PNL pnl, + pgno_t pgno) { const pgno_t *begin = MDBX_PNL_BEGIN(pnl); - const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno); - const pgno_t *end = begin + MDBX_PNL_SIZE(pnl); + const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_GETSIZE(pnl), pgno); + const pgno_t *end = begin + MDBX_PNL_GETSIZE(pnl); assert(it >= begin && it <= end); if (it != begin) assert(MDBX_PNL_ORDERED(it[-1], pgno)); if (it != end) assert(!MDBX_PNL_ORDERED(it[0], pgno)); - return (unsigned)(it - begin + 1); + return it - begin + 1; } -static __inline unsigned pnl_search(const MDBX_PNL pnl, pgno_t pgno, - size_t limit) { +static __inline size_t pnl_search(const MDBX_PNL pnl, pgno_t pgno, + size_t limit) { assert(pnl_check_allocated(pnl, limit)); assert(pgno < limit); (void)limit; return pnl_search_nochk(pnl, pgno); } -static __inline unsigned search_spilled(const MDBX_txn *txn, pgno_t pgno) { +static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { const MDBX_PNL pnl = txn->tw.spill_pages; if (likely(!pnl)) return 0; pgno <<= 1; - unsigned n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); - return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0; + size_t n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); + return (n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] == pgno) ? 
n : 0; } static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { + pgno_t npages) { const MDBX_PNL pnl = txn->tw.spill_pages; if (likely(!pnl)) return false; - const unsigned len = MDBX_PNL_SIZE(pnl); + const size_t len = MDBX_PNL_GETSIZE(pnl); if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - DEBUG_EXTRA("PNL len %u [", len); - for (unsigned i = 1; i <= len; ++i) + DEBUG_EXTRA("PNL len %zu [", len); + for (size_t i = 1; i <= len; ++i) DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1) : (long)(pnl[i] >> 1)); DEBUG_EXTRA_PRINT("%s\n", "]"); @@ -2554,20 +2552,21 @@ static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, const pgno_t spilled_range_begin = pgno << 1; const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1; #if MDBX_PNL_ASCENDING - const unsigned n = + const size_t n = pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); - assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); - const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= spilled_range_last; + assert(n && + (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); + const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last; #else - const unsigned n = + const size_t n = pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); - assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_last >= pnl[n])); - const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= spilled_range_begin; + assert(n && (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_last >= pnl[n])); + const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin; #endif if (ASSERT_ENABLED()) { bool check = false; - for (unsigned i = 0; i < npages; ++i) - check |= search_spilled(txn, pgno + i) != 0; + for (size_t i = 0; i < npages; ++i) + check |= search_spilled(txn, (pgno_t)(pgno + i)) != 0; assert(check == rc); } return rc; @@ -2612,8 +2611,8 @@ static void txl_free(MDBX_TXL tl) { static int 
txl_reserve(MDBX_TXL *ptl, const size_t wanna) { const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); - assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && - MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); + assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && + MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); if (likely(allocated >= wanna)) return MDBX_SUCCESS; @@ -2641,17 +2640,17 @@ static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, size_t num) { - assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && - MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); + assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && + MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); assert(num <= MDBX_PGL_LIMIT); - const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num; + const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptl) + num; return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? MDBX_SUCCESS : txl_reserve(ptl, wanna); } static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) { - assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); - MDBX_PNL_SIZE(tl) += 1; + assert(MDBX_PNL_GETSIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); + tl[0] += 1; MDBX_PNL_LAST(tl) = id; } @@ -2662,7 +2661,7 @@ static void txl_sort(MDBX_TXL tl) { } static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { - if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { + if (unlikely(MDBX_PNL_GETSIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { int rc = txl_need(ptl, MDBX_TXL_GRANULATE); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -2698,7 +2697,7 @@ static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { return bytes; } -static __always_inline unsigned dpl_bytes2size(const ptrdiff_t bytes) { +static __always_inline size_t dpl_bytes2size(const ptrdiff_t bytes) { size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP && size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); @@ -2706,10 
+2705,10 @@ static __always_inline unsigned dpl_bytes2size(const ptrdiff_t bytes) { #if MDBX_DPL_PREALLOC_FOR_RADIXSORT size >>= 1; #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ - return (unsigned)size; + return size; } -static __always_inline unsigned dpl_setlen(MDBX_dpl *dl, unsigned len) { +static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { static const MDBX_page dpl_stub_pageE = { {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0}; assert(dpl_stub_pageE.mp_flags == P_BAD && @@ -2783,7 +2782,7 @@ SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - const unsigned unsorted = dl->length - dl->sorted; + const size_t unsorted = dl->length - dl->sorted; if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || unlikely(!dpl_radixsort(dl->items + 1, dl->length))) { if (dl->sorted > unsorted / 4 + 4 && @@ -2818,7 +2817,7 @@ __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); if (ASSERT_ENABLED()) - for (unsigned i = 0; i <= dl->length; ++i) + for (size_t i = 0; i <= dl->length; ++i) assert(dl->items[i].pgno < dl->items[i + 1].pgno); } else { dp_sort(dl->items + 1, dl->items + dl->length + 1); @@ -2846,7 +2845,7 @@ static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { #define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) -__hot __noinline static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { +__hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); if (AUDIT_ENABLED()) { @@ -2884,28 +2883,28 @@ __hot __noinline static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { /* continue 
bsearch on the sorted part */ break; } - return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); + return dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; } MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned -dpl_npages(const MDBX_dpl *dl, unsigned i) { - assert(0 <= (int)i && i <= dl->length); +dpl_npages(const MDBX_dpl *dl, size_t i) { + assert(0 <= (intptr_t)i && i <= dl->length); unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages; assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1)); return n; } -MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned -dpl_endpgno(const MDBX_dpl *dl, unsigned i) { +MDBX_NOTHROW_PURE_FUNCTION static __inline pgno_t +dpl_endpgno(const MDBX_dpl *dl, size_t i) { return dpl_npages(dl, i) + dl->items[i].pgno; } static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { + pgno_t npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->sorted == dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - unsigned const n = dpl_search(txn, pgno); + size_t const n = dpl_search(txn, pgno); assert(n >= 1 && n <= dl->length + 1); assert(pgno <= dl->items[n].pgno); assert(pgno > dl->items[n - 1].pgno); @@ -2914,7 +2913,7 @@ static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; if (ASSERT_ENABLED()) { bool check = false; - for (unsigned i = 1; i <= dl->length; ++i) { + for (size_t i = 1; i <= dl->length; ++i) { const MDBX_page *const dp = dl->items[i].ptr; if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages || dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno)) @@ -2925,9 +2924,9 @@ static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, return rc; } -static __always_inline unsigned dpl_exist(MDBX_txn *txn, pgno_t pgno) { +static __always_inline size_t dpl_exist(MDBX_txn *txn, pgno_t pgno) { MDBX_dpl *dl = 
txn->tw.dirtylist; - unsigned i = dpl_search(txn, pgno); + size_t i = dpl_search(txn, pgno); assert((int)i > 0); return (dl->items[i].pgno == pgno) ? i : 0; } @@ -2936,22 +2935,21 @@ MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) { const MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - for (unsigned i = dl->length; i > dl->sorted; --i) + for (size_t i = dl->length; i > dl->sorted; --i) if (dl->items[i].pgno == pgno) return dl->items[i].ptr; if (dl->sorted) { - const unsigned i = - (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); + const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; if (dl->items[i].pgno == pgno) return dl->items[i].ptr; } return nullptr; } -static void dpl_remove_ex(const MDBX_txn *txn, unsigned i, unsigned npages) { +static void dpl_remove_ex(const MDBX_txn *txn, size_t i, pgno_t npages) { MDBX_dpl *dl = txn->tw.dirtylist; - assert((int)i > 0 && i <= dl->length); + assert((intptr_t)i > 0 && i <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); dl->pages_including_loose -= npages; dl->sorted -= dl->sorted >= i; @@ -2961,29 +2959,29 @@ static void dpl_remove_ex(const MDBX_txn *txn, unsigned i, unsigned npages) { assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } -static void dpl_remove(const MDBX_txn *txn, unsigned i) { +static void dpl_remove(const MDBX_txn *txn, size_t i) { dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); } static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, - unsigned npages) { + pgno_t npages) { MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); if (AUDIT_ENABLED()) { - for (unsigned i = dl->length; i > 0; --i) { + for 
(size_t i = dl->length; i > 0; --i) { assert(dl->items[i].pgno != pgno); if (unlikely(dl->items[i].pgno == pgno)) { - ERROR("Page %u already exist in the DPL at %u", pgno, i); + ERROR("Page %u already exist in the DPL at %zu", pgno, i); return MDBX_PROBLEM; } } } - const unsigned length = dl->length + 1; - const unsigned sorted = + const size_t length = dl->length + 1; + const size_t sorted = (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno) ? length : dl->sorted; @@ -3016,9 +3014,9 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, return MDBX_SUCCESS; } -static __inline uint32_t dpl_age(const MDBX_txn *txn, unsigned i) { +static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { const MDBX_dpl *dl = txn->tw.dirtylist; - assert((int)i > 0 && i <= dl->length); + assert((intptr_t)i > 0 && i <= dl->length); /* overflow could be here */ return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); } @@ -3033,7 +3031,7 @@ static __must_check_result __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp); static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages); + pgno_t npages); typedef struct page_result { MDBX_page *page; int err; @@ -3042,7 +3040,7 @@ typedef struct page_result { static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); -static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages); +static pgr_t page_new_large(MDBX_cursor *mc, const pgno_t npages); static int page_touch(MDBX_cursor *mc); static int cursor_touch(MDBX_cursor *mc); static int touch_dbi(MDBX_cursor *mc); @@ -3122,7 +3120,7 @@ static bool coherency_check_meta(const MDBX_env *env, static int __must_check_result validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, MDBX_meta *dest); -static int __must_check_result override_meta(MDBX_env *env, unsigned target, +static int __must_check_result override_meta(MDBX_env 
*env, size_t target, txnid_t txnid, const MDBX_meta *shape); static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, @@ -3140,17 +3138,17 @@ struct node_result { static struct node_result node_search(MDBX_cursor *mc, const MDBX_val *key); -static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, +static int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, const MDBX_val *key, pgno_t pgno); -static int __must_check_result node_add_leaf(MDBX_cursor *mc, unsigned indx, +static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, const MDBX_val *key, MDBX_val *data, unsigned flags); -static int __must_check_result node_add_leaf2(MDBX_cursor *mc, unsigned indx, +static int __must_check_result node_add_leaf2(MDBX_cursor *mc, size_t indx, const MDBX_val *key); static void node_del(MDBX_cursor *mc, size_t ksize); -static void node_shrink(MDBX_page *mp, unsigned indx); +static void node_shrink(MDBX_page *mp, size_t indx); static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft); static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, @@ -3161,7 +3159,7 @@ static int __must_check_result update_key(MDBX_cursor *mc, const MDBX_val *key); static void cursor_pop(MDBX_cursor *mc); static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); -static int __must_check_result audit_ex(MDBX_txn *txn, unsigned retired_stored, +static int __must_check_result audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc); static int __must_check_result page_check(MDBX_cursor *const mc, @@ -3192,7 +3190,7 @@ static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data); static int __must_check_result cursor_init(MDBX_cursor *mc, MDBX_txn *txn, - MDBX_dbi dbi); + size_t dbi); static int __must_check_result cursor_xinit0(MDBX_cursor *mc); static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node 
*node, const MDBX_page *mp); @@ -3203,7 +3201,7 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); static int __must_check_result drop_tree(MDBX_cursor *mc, const bool may_have_subDBs); -static int __must_check_result fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); +static int __must_check_result fetch_sdb(MDBX_txn *txn, size_t dbi); static int __must_check_result setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, const unsigned pagesize); @@ -3423,7 +3421,7 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, bool is_ascii = true; const uint8_t *const data = key->iov_base; - for (unsigned i = 0; i < key->iov_len; i++) + for (size_t i = 0; i < key->iov_len; i++) if (data[i] < ' ' || data[i] > '~') { is_ascii = false; break; @@ -3433,13 +3431,13 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, int len = snprintf(buf, bufsize, "%.*s", (key->iov_len > INT_MAX) ? INT_MAX : (int)key->iov_len, data); - assert(len > 0 && (unsigned)len < bufsize); + assert(len > 0 && (size_t)len < bufsize); (void)len; } else { char *const detent = buf + bufsize - 2; char *ptr = buf; *ptr++ = '<'; - for (unsigned i = 0; i < key->iov_len; i++) { + for (size_t i = 0; i < key->iov_len; i++) { const ptrdiff_t left = detent - ptr; assert(left > 0); int len = snprintf(ptr, left, "%02x", data[i]); @@ -3471,7 +3469,7 @@ MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { pgno_t pgno = mp->mp_pgno; const char *type; MDBX_node *node; - unsigned i, nkeys, nsize, total = 0; + size_t i, nkeys, nsize, total = 0; MDBX_val key; DKBUF; @@ -3504,37 +3502,37 @@ MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { } nkeys = page_numkeys(mp); - VERBOSE("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); + VERBOSE("%s %" PRIaPGNO " numkeys %zu\n", type, pgno, nkeys); for (i = 0; i < nkeys; i++) { if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ key.iov_len = nsize = mp->mp_leaf2_ksize; key.iov_base = page_leaf2key(mp, i, nsize); 
total += nsize; - VERBOSE("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); + VERBOSE("key %zu: nsize %zu, %s\n", i, nsize, DKEY(&key)); continue; } node = page_node(mp, i); key.iov_len = node_ks(node); key.iov_base = node->mn_data; - nsize = (unsigned)(NODESIZE + key.iov_len); + nsize = NODESIZE + key.iov_len; if (IS_BRANCH(mp)) { - VERBOSE("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), + VERBOSE("key %zu: page %" PRIaPGNO ", %s\n", i, node_pgno(node), DKEY(&key)); total += nsize; } else { if (node_flags(node) & F_BIGDATA) nsize += sizeof(pgno_t); else - nsize += (unsigned)node_ds(node); + nsize += node_ds(node); total += nsize; nsize += sizeof(indx_t); - VERBOSE("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), + VERBOSE("key %zu: nsize %zu, %s%s\n", i, nsize, DKEY(&key), leafnode_type(node)); } total = EVEN(total); } - VERBOSE("Total: header %u + contents %u + unused %u\n", + VERBOSE("Total: header %zu + contents %zu + unused %zu\n", IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, page_room(mp)); } @@ -3601,7 +3599,7 @@ int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, /* Allocate memory for a page. * Re-use old malloc'ed pages first for singletons, otherwise just malloc. * Set MDBX_TXN_ERROR on failure. 
*/ -static MDBX_page *page_malloc(MDBX_txn *txn, unsigned num) { +static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { MDBX_env *env = txn->mt_env; MDBX_page *np = env->me_dp_reserve; size_t size = env->me_psize; @@ -3636,12 +3634,12 @@ static MDBX_page *page_malloc(MDBX_txn *txn, unsigned num) { #endif VALGRIND_MAKE_MEM_UNDEFINED(np, size); np->mp_flags = 0; - np->mp_pages = num; + np->mp_pages = (pgno_t)num; return np; } /* Free a shadow dirty page */ -static void dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { +static void dpage_free(MDBX_env *env, MDBX_page *dp, pgno_t npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) @@ -3666,7 +3664,7 @@ static void dlist_free(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; MDBX_dpl *const dl = txn->tw.dirtylist; - for (unsigned i = 1; i <= dl->length; i++) + for (size_t i = 1; i <= dl->length; i++) dpage_free(env, dl->items[i].ptr, dpl_npages(dl, i)); dpl_clear(dl); @@ -3691,8 +3689,8 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { if (!AUDIT_ENABLED()) return true; - unsigned loose = 0, pages = 0; - for (unsigned i = dl->length; i > 0; --i) { + size_t loose = 0, pages = 0; + for (size_t i = dl->length; i > 0; --i) { const MDBX_page *const dp = dl->items[i].ptr; if (!dp) continue; @@ -3724,16 +3722,16 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { return false; } - const unsigned rpa = + const size_t rpa = pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, txn->mt_next_pgno); - tASSERT(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || + tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) || txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); - if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && + if (rpa <= MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) && unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) return false; 
if (num > 1) { - const unsigned rpb = pnl_search(txn->tw.reclaimed_pglist, - dp->mp_pgno + num - 1, txn->mt_next_pgno); + const size_t rpb = pnl_search(txn->tw.reclaimed_pglist, + dp->mp_pgno + num - 1, txn->mt_next_pgno); tASSERT(txn, rpa == rpb); if (unlikely(rpa != rpb)) return false; @@ -3748,7 +3746,7 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { if (unlikely(pages != dl->pages_including_loose)) return false; - for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) { const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); tASSERT(txn, !dp); if (unlikely(dp)) @@ -3763,21 +3761,21 @@ static void refund_reclaimed(MDBX_txn *txn) { /* Scanning in descend order */ pgno_t next_pgno = txn->mt_next_pgno; const MDBX_PNL pnl = txn->tw.reclaimed_pglist; - tASSERT(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); + tASSERT(txn, MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); #if MDBX_PNL_ASCENDING - unsigned i = MDBX_PNL_SIZE(pnl); + size_t i = MDBX_PNL_GETSIZE(pnl); tASSERT(txn, pnl[i] == next_pgno - 1); while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) ; - MDBX_PNL_SIZE(pnl) = i; + MDBX_PNL_SETSIZE(pnl, i); #else - unsigned i = 1; + size_t i = 1; tASSERT(txn, pnl[i] == next_pgno - 1); - unsigned len = MDBX_PNL_SIZE(pnl); + size_t len = MDBX_PNL_GETSIZE(pnl); while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) ; - MDBX_PNL_SIZE(pnl) = len -= i - 1; - for (unsigned move = 0; move < len; ++move) + MDBX_PNL_SETSIZE(pnl, len -= i - 1); + for (size_t move = 0; move < len; ++move) pnl[1 + move] = pnl[i + move]; #endif VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, @@ -3808,7 +3806,7 @@ static void refund_loose(MDBX_txn *txn) { /* Collect loose-pages which may be refunded. 
*/ tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); pgno_t most = MIN_PAGENO; - unsigned w = 0; + size_t w = 0; for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { tASSERT(txn, lp->mp_flags == P_LOOSE); tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); @@ -3823,23 +3821,25 @@ static void refund_loose(MDBX_txn *txn) { if (most + 1 == txn->mt_next_pgno) { /* Sort suitable list and refund pages at the tail. */ - MDBX_PNL_SIZE(suitable) = w; + MDBX_PNL_SETSIZE(suitable, w); pnl_sort(suitable, MAX_PAGENO + 1); /* Scanning in descend order */ - const int step = MDBX_PNL_ASCENDING ? -1 : 1; - const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1; - const int end = MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_SIZE(suitable) + 1; + const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; + const intptr_t begin = + MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(suitable) : 1; + const intptr_t end = + MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_GETSIZE(suitable) + 1; tASSERT(txn, suitable[begin] >= suitable[end - step]); tASSERT(txn, most == suitable[begin]); - for (int i = begin + step; i != end; i += step) { + for (intptr_t i = begin + step; i != end; i += step) { if (suitable[i] != most - 1) break; most -= 1; } - const unsigned refunded = txn->mt_next_pgno - most; - DEBUG("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, + const size_t refunded = txn->mt_next_pgno - most; + DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, most, txn->mt_next_pgno); txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; @@ -3848,7 +3848,7 @@ static void refund_loose(MDBX_txn *txn) { txn->mt_next_pgno = most; /* Filter-out dirty list */ - unsigned r = 0; + size_t r = 0; w = 0; if (dl->sorted) { do { @@ -3880,7 +3880,7 @@ static void refund_loose(MDBX_txn *txn) { tASSERT(txn, dl->sorted == dl->length); /* Scan dirtylist tail-forward and cutoff suitable pages. 
*/ - unsigned n; + size_t n; for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 && dl->items[n].ptr->mp_flags == P_LOOSE; --n) { @@ -3893,7 +3893,7 @@ static void refund_loose(MDBX_txn *txn) { dpl_setlen(dl, n); if (dl->sorted != dl->length) { - const unsigned refunded = dl->sorted - dl->length; + const size_t refunded = dl->sorted - dl->length; dl->sorted = dl->length; txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; @@ -3931,7 +3931,7 @@ static bool txn_refund(MDBX_txn *txn) { refund_loose(txn); while (true) { - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 || + if (MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) == 0 || MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) break; @@ -3963,7 +3963,7 @@ static __inline bool txn_refund(MDBX_txn *txn) { #endif /* MDBX_ENABLE_REFUND */ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, - unsigned npages) { + pgno_t npages) { MDBX_env *const env = txn->mt_env; DEBUG("kill %u page(s) %" PRIaPGNO, npages, pgno); eASSERT(env, pgno >= NUM_METAS && npages); @@ -3977,8 +3977,7 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, struct iovec iov[MDBX_COMMIT_PAGES]; iov[0].iov_len = env->me_psize; iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; - size_t iov_off = pgno2bytes(env, pgno); - unsigned n = 1; + size_t iov_off = pgno2bytes(env, pgno), n = 1; while (--npages) { iov[n] = iov[0]; if (++n == MDBX_COMMIT_PAGES) { @@ -3992,8 +3991,8 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, } /* Remove page from dirty list */ -static __inline void page_wash(MDBX_txn *txn, const unsigned di, - MDBX_page *const mp, const unsigned npages) { +static __inline void page_wash(MDBX_txn *txn, const size_t di, + MDBX_page *const mp, const pgno_t npages) { tASSERT(txn, di && di <= txn->tw.dirtylist->length && txn->tw.dirtylist->items[di].ptr == mp); dpl_remove_ex(txn, di, npages); @@ -4039,7 +4038,8 @@ static int 
page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, * requires support the list of dirty pages and avoid explicit spilling. * So for flexibility and avoid extra internal dependencies we just * fallback to reading if dirty list was not allocated yet. */ - unsigned di = 0, si = 0, npages = 1; + size_t di = 0, si = 0; + pgno_t npages = 1; bool is_frozen = false, is_spilled = false, is_shadowed = false; if (unlikely(!mp)) { if (ASSERT_ENABLED() && pageflags) { @@ -4331,7 +4331,7 @@ typedef struct iov_ctx { } iov_ctx_t; __must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, - unsigned items, pgno_t npages) { + size_t items, size_t npages) { ctx->env = txn->mt_env; ctx->ior = &txn->mt_env->me_ioring; ctx->err = osal_ioring_reserve(ctx->ior, items, @@ -4418,7 +4418,7 @@ __must_check_result static int iov_write(iov_ctx_t *ctx) { } __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, - MDBX_page *dp, unsigned npages) { + MDBX_page *dp, pgno_t npages) { MDBX_env *const env = txn->mt_env; tASSERT(txn, ctx->err == MDBX_SUCCESS); tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); @@ -4471,7 +4471,7 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, } static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, - const unsigned npages) { + const pgno_t npages) { #if !MDBX_AVOID_MSYNC tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); #endif /* MDBX_AVOID_MSYNC */ @@ -4488,13 +4488,13 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. 
*/ -static unsigned cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { - unsigned keep = 0; +static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { + size_t keep = 0; while (mc->mc_flags & C_INITIALIZED) { - for (unsigned i = 0; i < mc->mc_snum; ++i) { + for (size_t i = 0; i < mc->mc_snum; ++i) { const MDBX_page *mp = mc->mc_pg[i]; if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { - unsigned const n = dpl_search(txn, mp->mp_pgno); + size_t const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && dpl_age(txn, n)) { txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; @@ -4509,9 +4509,9 @@ static unsigned cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { return keep; } -static unsigned txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - unsigned keep = m0 ? cursor_keep(txn, m0) : 0; - for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) +static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { + size_t keep = m0 ? cursor_keep(txn, m0) : 0; + for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && txn->mt_dbs[i].md_root != P_INVALID) for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next) @@ -4524,7 +4524,7 @@ static unsigned txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { * 0 = should be spilled; * ... * > 255 = must not be spilled. */ -static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, +static unsigned spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; const uint32_t age = dpl_age(txn, i); @@ -4604,22 +4604,22 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, * the child hasn't committed yet, and we'd have no way to undo it if * the child aborted. 
*/ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, - const unsigned need) { + const size_t need) { #if xMDBX_DEBUG_SPILLING != 1 /* production mode */ if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) return MDBX_SUCCESS; - unsigned wanna_spill = need - txn->tw.dirtyroom; + size_t wanna_spill = need - txn->tw.dirtyroom; #else /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */ - unsigned wanna_spill = + size_t wanna_spill = (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1; #endif /* xMDBX_DEBUG_SPILLING */ int rc = MDBX_SUCCESS; #if !MDBX_AVOID_MSYNC if (txn->mt_flags & MDBX_WRITEMAP) { - NOTICE("%s-spilling of %u dirty-entries (have %u dirty-room, need %u)", + NOTICE("%s-spilling of %zu dirty-entries (have %zu dirty-room, need %zu)", "msync", wanna_spill, txn->tw.dirtyroom, need); tASSERT(txn, txn->tw.spill_pages == nullptr); const MDBX_env *env = txn->mt_env; @@ -4639,12 +4639,12 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } #endif /* MDBX_AVOID_MSYNC */ - const unsigned dirty = txn->tw.dirtylist->length; - const unsigned spill_min = + const size_t dirty = txn->tw.dirtylist->length; + const size_t spill_min = txn->mt_env->me_options.spill_min_denominator ? dirty / txn->mt_env->me_options.spill_min_denominator : 0; - const unsigned spill_max = + const size_t spill_max = dirty - (txn->mt_env->me_options.spill_max_denominator ? 
dirty / txn->mt_env->me_options.spill_max_denominator : 0); @@ -4653,8 +4653,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (!wanna_spill) return MDBX_SUCCESS; - NOTICE("%s-spilling %u dirty-entries (have %u dirty-room, need %u)", "pwrite", - wanna_spill, txn->tw.dirtyroom, need); + NOTICE("%s-spilling %zu dirty-entries (have %zu dirty-room, need %zu)", + "pwrite", wanna_spill, txn->tw.dirtyroom, need); tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { if (!txn->tw.spill_pages) { @@ -4680,13 +4680,13 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, MDBX_dpl *const dl = dpl_sort(txn); /* Preserve pages which may soon be dirtied again */ - const unsigned unspillable = txn_keep(txn, m0); + const size_t unspillable = txn_keep(txn, m0); if (unspillable + txn->tw.loose_count >= dl->length) { #if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */ if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) return MDBX_SUCCESS; #endif /* xMDBX_DEBUG_SPILLING */ - ERROR("all %u dirty pages are unspillable since referenced " + ERROR("all %zu dirty pages are unspillable since referenced " "by a cursor(s), use fewer cursors or increase " "MDBX_opt_txn_dp_limit", unspillable); @@ -4718,7 +4718,7 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, /* get min/max of LRU-labels */ uint32_t age_max = 0; - for (unsigned i = 1; i <= dl->length; ++i) { + for (size_t i = 1; i <= dl->length; ++i) { const uint32_t age = dpl_age(txn, i); age_max = (age_max >= age) ? 
age_max : age; } @@ -4729,7 +4729,7 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, pgno_t radix_counters[256], spillable = 0; memset(&radix_counters, 0, sizeof(radix_counters)); const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); - for (unsigned i = 1; i <= dl->length; ++i) { + for (size_t i = 1; i <= dl->length; ++i) { unsigned prio = spill_prio(txn, i, reciprocal); if (prio < 256) { radix_counters[prio] += 1; @@ -4738,8 +4738,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } if (likely(spillable > 0)) { - unsigned prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0]; - for (unsigned i = 1; i < 256; i++) { + size_t prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0]; + for (size_t i = 1; i < 256; i++) { if (amount < wanna_spill) { prio2spill = i; prio2adjacent = i + (257 - i) / 2; @@ -4752,8 +4752,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, break; } - VERBOSE("prio2spill %u, prio2adjacent %u, spillable %u," - " wanna-spill %u, amount %u", + VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %u," + " wanna-spill %zu, amount %zu", prio2spill, prio2adjacent, spillable, wanna_spill, amount); tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); @@ -4764,8 +4764,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - unsigned prev_prio = 256; - unsigned r, w, prio; + unsigned prev_prio = 256, prio; + size_t r, w; pgno_t spilled_entries = 0, spilled_npages = 0; for (w = 0, r = 1; r <= dl->length && spilled_entries < wanna_spill; prev_prio = prio, ++r) { @@ -4840,26 +4840,27 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; } - NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries, + NOTICE("spilled %u dirty-entries, now have %zu dirty-room", 
spilled_entries, txn->tw.dirtyroom); } else { tASSERT(txn, rc == MDBX_SUCCESS); - for (unsigned i = 1; i <= dl->length; ++i) { + for (size_t i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; - NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", - i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), - spill_prio(txn, i, reciprocal)); + NOTICE( + "dirtylist[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", + i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), + spill_prio(txn, i, reciprocal)); } } #if xMDBX_DEBUG_SPILLING == 2 if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1) - ERROR("dirty-list length: before %u, after %u, parent %i, loose %u; " - "needed %u, spillable %u; " - "spilled %u dirty-entries, now have %u dirty-room", + ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; " + "needed %zu, spillable %u; " + "spilled %u dirty-entries, now have %zu dirty-room", dl->length + spilled, dl->length, (txn->mt_parent && txn->mt_parent->tw.dirtylist) - ? (int)txn->mt_parent->tw.dirtylist->length + ? (intptr_t)txn->mt_parent->tw.dirtylist->length : -1, txn->tw.loose_count, need, spillable, spilled, txn->tw.dirtyroom); ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); @@ -4877,7 +4878,7 @@ static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, MDBX_txn *txn = mc->mc_txn; /* Estimate how much space this operation will take: */ /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ - unsigned need = CURSOR_STACK + 3; + size_t need = CURSOR_STACK + 3; /* 2) GC/FreeDB for any payload */ if (mc->mc_dbi > FREE_DBI) { need += txn->mt_dbs[FREE_DBI].md_depth + 3; @@ -5152,8 +5153,7 @@ static txnid_t recent_committed_txnid(const MDBX_env *env) { return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? 
m1 : m2); } -static __inline bool meta_eq(const meta_troika_t *troika, unsigned a, - unsigned b) { +static __inline bool meta_eq(const meta_troika_t *troika, size_t a, size_t b) { assert(a < NUM_METAS && b < NUM_METAS); return troika->txnid[a] == troika->txnid[b] && (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 && @@ -5253,11 +5253,11 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { lck->mti_readers_refresh_flag.weak = nothing_changed; jitter4testing(false); - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); new_oldest = steady; - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { const uint32_t pid = atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (!pid) @@ -5270,7 +5270,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) && safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) { - NOTICE("kick stuck reader[%u of %u].pid_%u %" PRIaTXN + NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, i, snap_nreaders, pid, rtxn, prev_oldest, steady); } @@ -5304,9 +5304,9 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck != NULL /* check for exclusive without-lck mode */)) { retry:; - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { /* jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( @@ -5329,7 +5329,7 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env 
*env, /* Add a page to the txn's dirty list */ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages) { + pgno_t npages) { #if xMDBX_DEBUG_SPILLING == 2 txn->mt_env->debug_dirtied_act += 1; ENSURE(txn->mt_env, @@ -5346,7 +5346,7 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, rc = pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - unsigned di = dpl_search(txn, loose->mp_pgno); + size_t di = dpl_search(txn, loose->mp_pgno); tASSERT(txn, txn->tw.dirtylist->items[di].ptr == loose); dpl_remove(txn, di); txn->tw.loose_pages = loose->mp_next; @@ -5355,7 +5355,7 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, if (!(txn->mt_flags & MDBX_WRITEMAP)) dpage_free(txn->mt_env, loose, 1); } else { - ERROR("Dirtyroom is depleted, DPL length %u", txn->tw.dirtylist->length); + ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); if (!(txn->mt_flags & MDBX_WRITEMAP)) dpage_free(txn->mt_env, mp, npages); return MDBX_TXN_FULL; @@ -5605,11 +5605,11 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, } /* looking for readers from this process */ - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); eASSERT(env, !implicit); mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { if (lck->mti_readers[i].mr_pid.weak == env->me_pid && lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { /* the base address of the mapping can't be changed since @@ -5831,7 +5831,7 @@ __cold static int wipe_steady(MDBX_txn *txn, txnid_t last_steady) { //------------------------------------------------------------------------------ MDBX_MAYBE_UNUSED __hot static pgno_t * -scan4seq_fallback(pgno_t *range, const size_t len, const 
unsigned seq) { +scan4seq_fallback(pgno_t *range, const size_t len, const size_t seq) { assert(seq > 0 && len > seq); #if MDBX_PNL_ASCENDING assert(range[-1] == len); @@ -5896,10 +5896,10 @@ scan4seq_fallback(pgno_t *range, const size_t len, const unsigned seq) { } MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, - const unsigned seq) { - size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pnl); + const size_t seq) { + size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pnl); #if MDBX_PNL_ASCENDING - while (seq <= MDBX_PNL_SIZE(pnl) - begin) { + while (seq <= MDBX_PNL_GETSIZE(pnl) - begin) { if (pnl[begin + seq] - pnl[begin] == seq) return pnl + begin; ++begin; @@ -5977,7 +5977,7 @@ diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, } MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * -scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) { +scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { assert(seq > 0 && len > seq); #if MDBX_PNL_ASCENDING #error "FIXME: Not implemented" @@ -6039,7 +6039,7 @@ diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, } MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t * -scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) { +scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { assert(seq > 0 && len > seq); #if MDBX_PNL_ASCENDING #error "FIXME: Not implemented" @@ -6107,7 +6107,7 @@ diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset, } MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t * -scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) { +scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { assert(seq > 0 && len > seq); #if MDBX_PNL_ASCENDING #error "FIXME: Not implemented" @@ -6187,7 +6187,7 @@ static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr, } __hot static pgno_t 
*scan4seq_neon(pgno_t *range, const size_t len, - const unsigned seq) { + const size_t seq) { assert(seq > 0 && len > seq); #if MDBX_PNL_ASCENDING #error "FIXME: Not implemented" @@ -6267,13 +6267,13 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, /* Selecting the most appropriate implementation at runtime, * depending on the available CPU features. */ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, - const unsigned seq); + const size_t seq); static pgno_t *(*scan4seq)(pgno_t *range, const size_t len, - const unsigned seq) = scan4seq_resolver; + const size_t seq) = scan4seq_resolver; static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, - const unsigned seq) { - pgno_t *(*choice)(pgno_t * range, const size_t len, const unsigned seq) = + const size_t seq) { + pgno_t *(*choice)(pgno_t * range, const size_t len, const size_t seq) = nullptr; #if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \ __GNUC_PREREQ(4, 8) @@ -6334,7 +6334,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & MDBX_LIFORECLAIM; if (txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) flags |= MDBX_ALLOC_COALESCE; if (unlikely( /* If mc is updating the GC, then the retired-list cannot play @@ -6352,7 +6352,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; - unsigned re_len = MDBX_PNL_SIZE(re_list); + size_t re_len = MDBX_PNL_GETSIZE(re_list); pgno_t *range = nullptr; txnid_t detent = 0, last = 0; #if MDBX_ENABLE_PGOP_STAT @@ -6461,7 +6461,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { /* skip 
IDs of records that already reclaimed */ if (txn->tw.lifo_reclaimed) { size_t i; - for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i) + for (i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i) if (txn->tw.lifo_reclaimed[i] == last) break; if (i) @@ -6495,24 +6495,24 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { ret.err = MDBX_CORRUPTED; goto fail; } - const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); - if (unlikely(/* list is too long already */ MDBX_PNL_SIZE( + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( txn->tw.reclaimed_pglist) >= env->me_options.rp_augment_limit) && ((/* not a slot-request from gc-update */ (flags & MDBX_ALLOC_SLOT) == 0 && /* have enough unallocated space */ txn->mt_geo.upper >= txn->mt_next_pgno + (size_t)num) || - gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= + gc_len + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) >= MDBX_PGL_LIMIT)) { /* Stop reclaiming to avoid large/overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. 
* todo4recovery://erased_by_github/libmdbx/issues/123 */ - NOTICE("stop reclaiming to avoid PNL overflow: %u (current) + %u " - "(chunk) -> %u", - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, - gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " + "(chunk) -> %zu", + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); break; } @@ -6530,9 +6530,9 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { txn->tw.last_reclaimed = last; if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", + DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " len %zu, PNL", last, txn->mt_dbs[FREE_DBI].md_root, gc_len); - for (unsigned i = gc_len; i; i--) + for (size_t i = gc_len; i; i--) DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); DEBUG_EXTRA_PRINT("%s\n", "."); } @@ -6545,14 +6545,14 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { } tASSERT(txn, dirtylist_check(txn)); - re_len = MDBX_PNL_SIZE(re_list); + re_len = MDBX_PNL_GETSIZE(re_list); tASSERT(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); if (MDBX_ENABLE_REFUND && re_len && unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { /* Refund suitable pages into "unallocated" space */ txn_refund(txn); re_list = txn->tw.reclaimed_pglist; - re_len = MDBX_PNL_SIZE(re_list); + re_len = MDBX_PNL_GETSIZE(re_list); } /* Done for a kick-reclaim mode, actually no page needed */ @@ -6762,11 +6762,11 @@ done: for (const pgno_t *const end = re_list + re_len; ++range <= end;) range[-(ptrdiff_t)num] = *range; #endif - MDBX_PNL_SIZE(re_list) = re_len -= num; + MDBX_PNL_SETSIZE(re_list, re_len -= num); tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { - txn->mt_next_pgno = pgno 
+ num; + txn->mt_next_pgno = (pgno_t)(pgno + num); eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); } @@ -6778,7 +6778,7 @@ done: ret.page->mp_leaf2_ksize = 0; ret.page->mp_flags = 0; if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { - ret.page->mp_pages = num; + ret.page->mp_pages = (pgno_t)num; ret.page->mp_flags = P_OVERFLOW; } ret.err = page_dirty(txn, ret.page, num); @@ -6818,12 +6818,12 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { if (likely(!(mc->mc_flags & C_GCFREEZE))) { MDBX_PNL pnl = txn->tw.reclaimed_pglist; - const unsigned len = MDBX_PNL_SIZE(pnl); + const size_t len = MDBX_PNL_GETSIZE(pnl); if (likely(len > 0)) { - MDBX_PNL_SIZE(pnl) = len - 1; + MDBX_PNL_SETSIZE(pnl, len - 1); #if MDBX_PNL_ASCENDING const pgno_t pgno = pnl[1]; - for (unsigned i = 1; i < len; ++i) + for (size_t i = 1; i < len; ++i) pnl[i] = pnl[i + 1]; #else const pgno_t pgno = pnl[len]; @@ -6893,7 +6893,7 @@ static pgr_t __must_check_result page_unspill(MDBX_txn *const txn, pgr_t ret; do { tASSERT(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); - const unsigned si = search_spilled(scan, mp->mp_pgno); + const size_t si = search_spilled(scan, mp->mp_pgno); if (!si) continue; const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; @@ -7328,7 +7328,7 @@ static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { * * Returns 0 on success, non-zero on failure. 
*/ static void cursors_eot(MDBX_txn *txn, const bool merge) { - for (int i = txn->mt_numdbs; --i >= 0;) { + for (intptr_t i = txn->mt_numdbs; --i >= 0;) { MDBX_cursor *next, *mc = txn->mt_cursors[i]; if (!mc) continue; @@ -7339,7 +7339,7 @@ static void cursors_eot(MDBX_txn *txn, const bool merge) { next = mc->mc_next; ENSURE(txn->mt_env, stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); - cASSERT(mc, mc->mc_dbi == (unsigned)i); + cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); if (bk) { MDBX_xcursor *mx = mc->mc_xcursor; cASSERT(mc, mx == bk->mc_xcursor); @@ -7383,9 +7383,9 @@ static void cursors_eot(MDBX_txn *txn, const bool merge) { static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck != NULL /* exclusive mode */)) { - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) == env->me_pid) { @@ -7491,7 +7491,7 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { } result.err = MDBX_SUCCESS; - unsigned slot, nreaders; + size_t slot, nreaders; while (1) { nreaders = env->me_lck->mti_numreaders.weak; for (slot = 0; slot < nreaders; slot++) @@ -7520,7 +7520,7 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { atomic_store32(&result.rslot->mr_pid, 0, mo_AcquireRelease); safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) - env->me_lck->mti_numreaders.weak = ++nreaders; + env->me_lck->mti_numreaders.weak = (uint32_t)++nreaders; result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 
0 : tid; atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); osal_rdt_unlock(env); @@ -7809,7 +7809,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Seek & fetch the last meta */ uint64_t timestamp = 0; - unsigned loop = 0; + size_t loop = 0; meta_troika_t troika = meta_tap(env); while (1) { const meta_ptr_t head = @@ -7902,9 +7902,9 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && (env->me_flags & MDBX_NOTLS) == 0 && (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) == env->me_pid && unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) == @@ -7960,12 +7960,12 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { #if MDBX_ENABLE_REFUND txn->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ - MDBX_PNL_SIZE(txn->tw.retired_pages) = 0; + MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); txn->tw.spill_pages = NULL; txn->tw.spill_least_removed = 0; txn->tw.last_reclaimed = 0; if (txn->tw.lifo_reclaimed) - MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0; + MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, 0); env->me_txn = txn; txn->mt_numdbs = env->me_numdbs; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); @@ -7984,7 +7984,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Setup db info */ osal_compiler_barrier(); memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs); - for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { + for (size_t i = CORE_DBS; i < txn->mt_numdbs; i++) { const unsigned db_flags = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS; txn->mt_dbistate[i] = @@ -8136,7 +8136,7 @@ void 
*mdbx_txn_get_userctx(const MDBX_txn *txn) { int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret, void *context) { MDBX_txn *txn; - unsigned size, tsize; + size_t size, tsize; if (unlikely(!ret)) return MDBX_EINVAL; @@ -8210,8 +8210,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_geo = parent->mt_geo; rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { - const unsigned len = - MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count; + const size_t len = MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist) + + parent->tw.loose_count; txn->tw.reclaimed_pglist = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); if (unlikely(!txn->tw.reclaimed_pglist)) @@ -8229,7 +8229,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.loose_count) { do { MDBX_page *lp = parent->tw.loose_pages; - const unsigned di = dpl_exist(parent, lp->mp_pgno); + const size_t di = dpl_exist(parent, lp->mp_pgno); tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp); tASSERT(parent, lp->mp_flags == P_LOOSE); rc = pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); @@ -8253,7 +8253,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, spill_purge(parent); tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= - MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); + MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist)); memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); eASSERT(env, pnl_check_allocated( @@ -8267,12 +8267,12 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.lifo_reclaimed) { txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; parent->tw.lifo_reclaimed = - (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.lifo_reclaimed); + (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.lifo_reclaimed); } 
txn->tw.retired_pages = parent->tw.retired_pages; parent->tw.retired_pages = - (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages); + (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages); txn->mt_txnid = parent->mt_txnid; txn->mt_front = parent->mt_front + 1; @@ -8288,7 +8288,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); txn->tw.troika = parent->tw.troika; /* Copy parent's mt_dbistate, but clear DB_NEW */ - for (unsigned i = 0; i < txn->mt_numdbs; i++) + for (size_t i = 0; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] = parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); tASSERT(parent, @@ -8390,9 +8390,9 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { if (scan_rlt && info->txn_reader_lag > 1 && lck) { /* find next more recent reader */ txnid_t next_reader = head.txnid; - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { jitter4testing(true); @@ -8428,8 +8428,8 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now); info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper); info->txn_space_retired = pgno2bytes( - env, txn->mt_child ? (unsigned)(uintptr_t)txn->tw.retired_pages - : MDBX_PNL_SIZE(txn->tw.retired_pages)); + env, txn->mt_child ? 
(size_t)txn->tw.retired_pages + : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); info->txn_space_dirty = pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom); @@ -8437,14 +8437,14 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && lck) { txnid_t oldest_snapshot = txn->mt_txnid; - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); if (snap_nreaders) { oldest_snapshot = txn_oldest_reader(txn); if (oldest_snapshot == txn->mt_txnid - 1) { /* check if there is at least one reader */ bool exists = false; - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) && txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) { exists = true; @@ -8484,7 +8484,7 @@ int mdbx_txn_flags(const MDBX_txn *txn) { } /* Check for misused dbi handles */ -static __inline bool dbi_changed(MDBX_txn *txn, MDBX_dbi dbi) { +static __inline bool dbi_changed(MDBX_txn *txn, size_t dbi) { if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs) return false; if (likely( @@ -8495,15 +8495,15 @@ static __inline bool dbi_changed(MDBX_txn *txn, MDBX_dbi dbi) { return true; } -static __inline unsigned dbi_seq(const MDBX_env *const env, unsigned slot) { +static __inline unsigned dbi_seq(const MDBX_env *const env, size_t slot) { unsigned v = env->me_dbiseqs[slot].weak + 1; return v + (v == 0); } static void dbi_import_locked(MDBX_txn *txn) { const MDBX_env *const env = txn->mt_env; - unsigned n = env->me_numdbs; - for (unsigned i = CORE_DBS; i < n; ++i) { + size_t n = env->me_numdbs; + for (size_t i = CORE_DBS; i < n; ++i) { if (i >= txn->mt_numdbs) { txn->mt_cursors[i] = NULL; if (txn->mt_dbiseqs != env->me_dbiseqs) @@ -8538,7 +8538,7 @@ static void dbi_import_locked(MDBX_txn *txn) 
{ } ++n; } - txn->mt_numdbs = n; + txn->mt_numdbs = (MDBX_dbi)n; } /* Import DBI which opened after txn started into context */ @@ -8563,7 +8563,7 @@ static void dbi_update(MDBX_txn *txn, int keep) { bool locked = false; MDBX_env *const env = txn->mt_env; - for (unsigned i = n; --i >= CORE_DBS;) { + for (size_t i = n; --i >= CORE_DBS;) { if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) continue; if (!locked) { @@ -8608,19 +8608,19 @@ static void dbi_update(MDBX_txn *txn, int keep) { /* Filter-out pgno list from transaction's dirty-page list */ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { - if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { + if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) { tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); MDBX_dpl *dl = dpl_sort(txn); /* Scanning in ascend order */ - const int step = MDBX_PNL_ASCENDING ? 1 : -1; - const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl); - const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0; + const intptr_t step = MDBX_PNL_ASCENDING ? 1 : -1; + const intptr_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pl); + const intptr_t end = MDBX_PNL_ASCENDING ? 
MDBX_PNL_GETSIZE(pl) + 1 : 0; tASSERT(txn, pl[begin] <= pl[end - step]); - unsigned r = dpl_search(txn, pl[begin] >> spilled); + size_t w, r = dpl_search(txn, pl[begin] >> spilled); tASSERT(txn, dl->sorted == dl->length); - for (int i = begin; r <= dl->length;) { /* scan loop */ + for (intptr_t i = begin; r <= dl->length;) { /* scan loop */ assert(i != end); tASSERT(txn, !spilled || (pl[i] & 1) == 0); pgno_t pl_pgno = pl[i] >> spilled; @@ -8635,7 +8635,8 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { } /* update loop */ - unsigned npages, w = r; + unsigned npages; + w = r; remove_dl: npages = dpl_npages(dl, r); dl->pages_including_loose -= npages; @@ -8767,18 +8768,18 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { sizeof(meta_troika_t)) == 0); if (txn->tw.lifo_reclaimed) { - eASSERT(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= - (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); - MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = - (unsigned)(uintptr_t)parent->tw.lifo_reclaimed; + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) >= + (uintptr_t)parent->tw.lifo_reclaimed); + MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, + (uintptr_t)parent->tw.lifo_reclaimed); parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; } if (txn->tw.retired_pages) { - eASSERT(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= - (unsigned)(uintptr_t)parent->tw.retired_pages); - MDBX_PNL_SIZE(txn->tw.retired_pages) = - (unsigned)(uintptr_t)parent->tw.retired_pages; + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.retired_pages) >= + (uintptr_t)parent->tw.retired_pages); + MDBX_PNL_SETSIZE(txn->tw.retired_pages, + (uintptr_t)parent->tw.retired_pages); parent->tw.retired_pages = txn->tw.retired_pages; } @@ -8882,20 +8883,19 @@ int mdbx_txn_abort(MDBX_txn *txn) { /* Count all the pages in each DB and in the GC and make sure * it matches the actual number of pages being used. 
*/ -__cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, +__cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) { - pgno_t pending = 0; - if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { - pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + - (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored); - } + size_t pending = 0; + if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) + pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) + + (MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored); MDBX_cursor_couple cx; int rc = cursor_init(&cx.outer, txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - pgno_t gc = 0; + size_t gc = 0; MDBX_val key, data; while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { if (!dont_filter_gc) { @@ -8903,7 +8903,7 @@ __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, return MDBX_CORRUPTED; txnid_t id = unaligned_peek_u64(4, key.iov_base); if (txn->tw.lifo_reclaimed) { - for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); ++i) + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); ++i) if (id == txn->tw.lifo_reclaimed[i]) goto skip; } else if (id <= txn->tw.last_reclaimed) @@ -8915,11 +8915,11 @@ __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, } tASSERT(txn, rc == MDBX_NOTFOUND); - for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) + for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] &= ~DBI_AUDITED; - pgno_t used = NUM_METAS; - for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) { + size_t used = NUM_METAS; + for (size_t i = FREE_DBI; i <= MAIN_DBI; i++) { if (!(txn->mt_dbistate[i] & DBI_VALID)) continue; rc = cursor_init(&cx.outer, txn, i); @@ -8936,7 +8936,7 @@ __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (rc == MDBX_SUCCESS) { MDBX_page *mp = 
cx.outer.mc_pg[cx.outer.mc_top]; - for (unsigned j = 0; j < page_numkeys(mp); j++) { + for (size_t j = 0; j < page_numkeys(mp); j++) { MDBX_node *node = page_node(mp, j); if (node_flags(node) == F_SUBDATA) { if (unlikely(node_ds(node) != sizeof(MDBX_db))) @@ -8966,7 +8966,7 @@ __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, tASSERT(txn, rc == MDBX_NOTFOUND); } - for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) { + for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) { if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) != DBI_VALID) continue; @@ -8979,7 +8979,7 @@ __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, } if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { WARNING("audit %s@%" PRIaTXN - ": unable account dbi %d / \"%*s\", state 0x%02x", + ": unable account dbi %zd / \"%*s\", state 0x%02x", txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, (int)txn->mt_dbxs[i].md_name.iov_len, (const char *)txn->mt_dbxs[i].md_name.iov_base, @@ -8991,23 +8991,23 @@ __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, return MDBX_SUCCESS; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - ERROR("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " - "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", + ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + " + "%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)", txn->mt_txnid, pending, txn->tw.loose_count, - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), - txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), + txn->tw.retired_pages ? 
MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, retired_stored); - ERROR("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO - "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO + ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu" + "(gc) + %zu(count) = %zu(total) <> %zu" "(allocated)", txn->mt_txnid, pending, gc, used, pending + gc + used, - txn->mt_next_pgno); + (size_t)txn->mt_next_pgno); return MDBX_PROBLEM; } typedef struct gc_update_context { - unsigned retired_stored, loop; - unsigned settled, cleaned_slot, reused_slot, filled_slot; + size_t retired_stored, loop; + size_t settled, cleaned_slot, reused_slot, filled_slot; txnid_t cleaned_id, rid; bool lifo, dense; #if MDBX_ENABLE_BIGFOOT @@ -9025,8 +9025,8 @@ static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { return cursor_init(&ctx->cursor.outer, txn, FREE_DBI); } -static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) { - return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; +static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { + return MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; } static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { @@ -9045,7 +9045,7 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; err = mdbx_cursor_del(&ctx->cursor.outer, 0); - TRACE("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn), + TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); } } @@ -9062,12 +9062,12 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { * during a deleting, when GC tree is unbalanced. */ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, const bool reserve4retired) { - const unsigned pages4retiredlist = + const size_t pages4retiredlist = reserve4retired ? 
number_of_ovpages( txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)) : 0; - const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; - const unsigned backlog4rebalance = backlog4cow + 1; + const size_t backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; + const size_t backlog4rebalance = backlog4cow + 1; if (likely(pages4retiredlist < 2 && gcu_backlog_size(txn) > (reserve4retired @@ -9075,9 +9075,10 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, : (backlog4cow + backlog4rebalance)))) return MDBX_SUCCESS; - TRACE(">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u", - reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, - backlog4cow, backlog4rebalance); + TRACE( + ">> reserve4retired %c, backlog %zu, 4list %zu, 4cow %zu, 4rebalance %zu", + reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, + backlog4cow, backlog4rebalance); int err; if (unlikely(pages4retiredlist > 2)) { @@ -9092,19 +9093,19 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; err = cursor_touch(&ctx->cursor.outer); - TRACE("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err); + TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); if (unlikely(pages4retiredlist > 1) && - MDBX_PNL_SIZE(txn->tw.retired_pages) != ctx->retired_stored && + MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && err == MDBX_SUCCESS) { tASSERT(txn, reserve4retired); err = gcu_clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; - err = page_alloc_slowpath(&ctx->cursor.outer, pages4retiredlist, + err = page_alloc_slowpath(&ctx->cursor.outer, (pgno_t)pages4retiredlist, MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) .err; - TRACE("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), err); + TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); cASSERT(&ctx->cursor.outer, gcu_backlog_size(txn) >= 
pages4retiredlist || err != MDBX_SUCCESS); } @@ -9117,7 +9118,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, .err; ctx->cursor.outer.mc_flags |= C_RECLAIMING; - TRACE("<< backlog %u, err %d", gcu_backlog_size(txn), err); + TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } @@ -9161,7 +9162,7 @@ retry: txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { - ERROR("too more loops %u, bailout", ctx->loop); + ERROR("too more loops %zu, bailout", ctx->loop); rc = MDBX_PROBLEM; goto bailout; } @@ -9183,8 +9184,8 @@ retry: MDBX_val key, data; TRACE("%s", " >> continue"); - if (ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && - (MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || + if (ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages) && + (MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || ctx->retired_stored > env->me_maxgc_ov1page)) { rc = gcu_prepare_backlog(txn, ctx, true); if (unlikely(rc != MDBX_SUCCESS)) @@ -9195,7 +9196,7 @@ retry: txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (ctx->lifo) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0)) { ctx->settled = 0; ctx->cleaned_slot = 0; @@ -9220,13 +9221,13 @@ retry: goto bailout; } tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); - TRACE("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, + TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix_mode, ctx->cleaned_slot, ctx->cleaned_id); tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); rc = mdbx_cursor_del(&ctx->cursor.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } while (ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); txl_sort(txn->tw.lifo_reclaimed); } } else { @@ -9292,14 +9293,14 @@ retry: * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (txn->tw.loose_count > 0) { - TRACE("%s: try allocate gc-slot for %u loose-pages", dbg_prefix_mode, + TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, txn->tw.loose_count); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) .err; if (rc == MDBX_SUCCESS) { - TRACE("%s: retry since gc-slot for %u loose-pages available", + TRACE("%s: retry since gc-slot for %zu loose-pages available", dbg_prefix_mode, txn->tw.loose_count); continue; } @@ -9311,7 +9312,7 @@ retry: goto bailout; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - TRACE("%s: append %u loose-pages to retired-pages", dbg_prefix_mode, + TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, txn->tw.loose_count); } } else { @@ -9322,23 +9323,23 @@ retry: MDBX_PNL loose = txn->tw.reclaimed_pglist + MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) - txn->tw.loose_count - 1; - unsigned count = 0; + size_t count = 0; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { tASSERT(txn, mp->mp_flags == P_LOOSE); 
loose[++count] = mp->mp_pgno; } tASSERT(txn, count == txn->tw.loose_count); - MDBX_PNL_SIZE(loose) = count; + MDBX_PNL_SETSIZE(loose, count); pnl_sort(loose, txn->mt_next_pgno); pnl_merge(txn->tw.reclaimed_pglist, loose); - TRACE("%s: append %u loose-pages to reclaimed-pages", dbg_prefix_mode, + TRACE("%s: append %zu loose-pages to reclaimed-pages", dbg_prefix_mode, txn->tw.loose_count); } /* filter-out list of dirty-pages from loose-pages */ MDBX_dpl *const dl = txn->tw.dirtylist; - unsigned w = 0; - for (unsigned r = w; ++r <= dl->length;) { + size_t w = 0; + for (size_t r = w; ++r <= dl->length;) { MDBX_page *dp = dl->items[r].ptr; tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); @@ -9351,7 +9352,7 @@ retry: dpage_free(env, dp, 1); } } - TRACE("%s: filtered-out loose-pages from %u -> %u dirty-pages", + TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", dbg_prefix_mode, dl->length, w); tASSERT(txn, txn->tw.loose_count == dl->length - w); dpl_setlen(dl, w); @@ -9368,9 +9369,9 @@ retry: #endif /* MDBX_ENABLE_REFUND */ } - const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist); /* handle retired-list - store ones into single gc-record */ - if (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { + if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; @@ -9382,14 +9383,14 @@ retry: } #if MDBX_ENABLE_BIGFOOT - unsigned retired_pages_before; + size_t retired_pages_before; do { if (ctx->bigfoot > txn->mt_txnid) { rc = gcu_clean_stored_retired(txn, ctx); tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); } - retired_pages_before = MDBX_PNL_SIZE(txn->tw.retired_pages); + retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); rc = 
gcu_prepare_backlog(txn, ctx, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -9400,9 +9401,9 @@ retry: do { key.iov_len = sizeof(txnid_t); key.iov_base = &ctx->bigfoot; - const unsigned left = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages) - - ctx->retired_stored; - const unsigned chunk = + const size_t left = + MDBX_PNL_GETSIZE(txn->tw.retired_pages) - ctx->retired_stored; + const size_t chunk = (left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID) ? env->me_maxgc_ov1page : left; @@ -9411,10 +9412,10 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - if (retired_pages_before == MDBX_PNL_SIZE(txn->tw.retired_pages)) { - const unsigned at = (ctx->lifo == MDBX_PNL_ASCENDING) - ? left - chunk - : ctx->retired_stored; + if (retired_pages_before == MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + const size_t at = (ctx->lifo == MDBX_PNL_ASCENDING) + ? left - chunk + : ctx->retired_stored; pgno_t *const begin = txn->tw.retired_pages + at; /* MDBX_PNL_ASCENDING == false && LIFO == false: * - the larger pgno is at the beginning of retired list @@ -9424,19 +9425,20 @@ retry: * and should be placed with the smaller txnid. 
*/ const pgno_t save = *begin; - *begin = chunk; + *begin = (pgno_t)chunk; memcpy(data.iov_base, begin, data.iov_len); *begin = save; TRACE("%s: put-retired/bigfoot @ %" PRIaTXN - " (slice #%u) #%u [%u..%u] of %u", + " (slice #%u) #%zu [%zu..%zu] of %zu", dbg_prefix_mode, ctx->bigfoot, (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, at + chunk, retired_pages_before); } ctx->retired_stored += chunk; - } while (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages) && + } while (ctx->retired_stored < + MDBX_PNL_GETSIZE(txn->tw.retired_pages) && (++ctx->bigfoot, true)); - } while (retired_pages_before != MDBX_PNL_SIZE(txn->tw.retired_pages)); + } while (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)); #else /* Write to last page of GC */ key.iov_len = sizeof(txnid_t); @@ -9450,7 +9452,7 @@ retry: /* Retry if tw.retired_pages[] grew during the Put() */ } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - ctx->retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); + ctx->retired_stored = MDBX_PNL_GETSIZE(txn->tw.retired_pages); pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); @@ -9459,17 +9461,17 @@ retry: ctx->retired_stored, txn->mt_txnid); #endif /* MDBX_ENABLE_BIGFOOT */ if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - unsigned i = ctx->retired_stored; - DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %u, retired-PNL", + size_t i = ctx->retired_stored; + DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %zu, retired-PNL", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); DEBUG_EXTRA_PRINT("%s\n", "."); } - if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && + if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) && ctx->settled)) { - TRACE("%s: reclaimed-list changed %u -> %u, retry", 
dbg_prefix_mode, - amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix_mode, + amount, MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); goto retry /* rare case, but avoids GC fragmentation and one cycle. */ ; @@ -9488,18 +9490,16 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - const unsigned left = amount - ctx->settled; - TRACE("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " - "reused-gc-slots %u", - dbg_prefix_mode, amount, ctx->settled, (int)left, - txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0, + const size_t left = amount - ctx->settled; + TRACE("%s: amount %zu, settled %zd, left %zd, lifo-reclaimed-slots %zu, " + "reused-gc-slots %zu", + dbg_prefix_mode, amount, ctx->settled, left, + txn->tw.lifo_reclaimed ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0, ctx->reused_slot); - if (0 >= (int)left) + if (0 >= (intptr_t)left) break; - const unsigned prefer_max_scatter = 257; + const size_t prefer_max_scatter = 257; txnid_t reservation_gc_id; if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { @@ -9509,10 +9509,8 @@ retry: goto bailout; } } - if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < - prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - ctx->reused_slot) * + if (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page && !ctx->dense) { /* LY: need just a txn-id for save page list. 
*/ @@ -9531,12 +9529,12 @@ retry: MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); need_cleanup = true; } - } while (rc == MDBX_SUCCESS && - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < - prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - ctx->reused_slot) * - env->me_maxgc_ov1page); + } while ( + rc == MDBX_SUCCESS && + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > + (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * + env->me_maxgc_ov1page); ctx->cursor.outer.mc_flags |= C_RECLAIMING; if (likely(rc == MDBX_SUCCESS)) { @@ -9546,7 +9544,7 @@ retry: /* LY: some troubles... */ goto bailout; - if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { + if (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { if (need_cleanup) { txl_sort(txn->tw.lifo_reclaimed); ctx->cleaned_slot = 0; @@ -9566,17 +9564,17 @@ retry: } /* LY: GC is empty, will look any free txn-id in high2low order. */ - while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - + while (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page) { if (unlikely(ctx->rid <= MIN_TXNID)) { - if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= + if (unlikely(MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) <= ctx->reused_slot)) { - NOTICE("** restart: reserve depleted (reused_gc_slot %u >= " - "lifo_reclaimed %u" PRIaTXN, + NOTICE("** restart: reserve depleted (reused_gc_slot %zu >= " + "lifo_reclaimed %zu" PRIaTXN, ctx->reused_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); goto retry; } break; @@ -9627,7 +9625,7 @@ retry: 1 /* mark cleanup is not needed for added slot. 
*/; TRACE("%s: append @%" PRIaTXN - " to lifo-reclaimed, cleaned-gc-slot = %u", + " to lifo-reclaimed, cleaned-gc-slot = %zu", dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); } @@ -9640,11 +9638,11 @@ retry: } } - const unsigned i = - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; - tASSERT(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + const size_t i = + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; + tASSERT(txn, i > 0 && i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); reservation_gc_id = txn->tw.lifo_reclaimed[i]; - TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", dbg_prefix_mode, + TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%zu]", dbg_prefix_mode, reservation_gc_id, i); } else { tASSERT(txn, txn->tw.lifo_reclaimed == NULL); @@ -9675,39 +9673,38 @@ retry: } ++ctx->reused_slot; - unsigned chunk = left; + size_t chunk = left; if (unlikely(chunk > env->me_maxgc_ov1page)) { - const unsigned avail_gc_slots = + const size_t avail_gc_slots = txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - ctx->reused_slot + 1 - : (ctx->rid < INT16_MAX) ? (unsigned)ctx->rid + ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot + 1 + : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid : INT16_MAX; if (avail_gc_slots > 1) { if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; else { - const unsigned threshold = + const size_t threshold = env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter) ? 
avail_gc_slots : prefer_max_scatter); if (left < threshold) chunk = env->me_maxgc_ov1page; else { - const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1; - unsigned span = 1; - unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / - sizeof(pgno_t)) /* - 1 + span */; + const size_t tail = left - threshold + env->me_maxgc_ov1page + 1; + size_t span = 1; + size_t avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / + sizeof(pgno_t)) /* - 1 + span */; if (tail > avail) { - for (unsigned i = amount - span; i > 0; --i) { + for (size_t i = amount - span; i > 0; --i) { if (MDBX_PNL_ASCENDING ? (txn->tw.reclaimed_pglist[i] + span) : (txn->tw.reclaimed_pglist[i] - span) == txn->tw.reclaimed_pglist[i + span]) { span += 1; - avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / - sizeof(pgno_t)) - - 1 + span; + avail = + ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - + 1 + span; if (avail >= tail) break; } @@ -9725,11 +9722,11 @@ retry: } tASSERT(txn, chunk > 0); - TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " + TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %zu, reservation-id " "%" PRIaTXN, dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); - TRACE("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, + TRACE("%s: chunk %zu, gc-per-ovpage %u", dbg_prefix_mode, chunk, env->me_maxgc_ov1page); tASSERT(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); @@ -9746,7 +9743,7 @@ retry: key.iov_len = sizeof(reservation_gc_id); key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); - TRACE("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, + TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); gcu_prepare_backlog(txn, ctx, true); rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, @@ -9758,42 +9755,43 @@ retry: gcu_clean_reserved(env, data); ctx->settled += chunk; - TRACE("%s: 
settled %u (+%u), continue", dbg_prefix_mode, ctx->settled, + TRACE("%s: settled %zu (+%zu), continue", dbg_prefix_mode, ctx->settled, chunk); if (txn->tw.lifo_reclaimed && - unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) && - (ctx->loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > + unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)) && + (ctx->loop < 5 || MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) - amount > env->me_maxgc_ov1page)) { - NOTICE("** restart: reclaimed-list growth %u -> %u", amount, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + NOTICE("** restart: reclaimed-list growth %zu -> %zu", amount, + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); goto retry; } continue; } - tASSERT(txn, ctx->cleaned_slot == (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)); + tASSERT(txn, + ctx->cleaned_slot == (txn->tw.lifo_reclaimed + ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) + : 0)); TRACE("%s", " >> filling"); /* Fill in the reserved records */ ctx->filled_slot = txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot + ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot : ctx->reused_slot; rc = MDBX_SUCCESS; tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { + if (MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); - unsigned left = amount; + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist); + size_t left = amount; if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); rc = cursor_first(&ctx->cursor.outer, &key, &data); @@ -9805,30 +9803,28 @@ retry: while (true) { txnid_t fill_gc_id; - TRACE("%s: left %u of %u", dbg_prefix_mode, left, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + TRACE("%s: left %zu of %zu", dbg_prefix_mode, left, + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { NOTICE( - "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN + "** restart: reserve depleted (filled_slot %zu, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN, ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); goto retry; } } else { tASSERT(txn, ctx->lifo != 0); - if (++ctx->filled_slot > - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { - NOTICE("** restart: reserve depleted (filled_gc_slot %u > " - "lifo_reclaimed %u" PRIaTXN, - ctx->filled_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + if (++ctx->filled_slot > MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { + NOTICE("** restart: reserve depleted (filled_gc_slot %zu > " + "lifo_reclaimed %zu" PRIaTXN, + ctx->filled_slot, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); goto retry; } 
fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; - TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", + TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%zu]", dbg_prefix_mode, fill_gc_id, ctx->filled_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); @@ -9836,10 +9832,10 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - tASSERT(txn, - ctx->cleaned_slot == (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)); + tASSERT(txn, ctx->cleaned_slot == + (txn->tw.lifo_reclaimed + ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) + : 0)); tASSERT(txn, fill_gc_id > 0 && fill_gc_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &fill_gc_id; @@ -9847,9 +9843,9 @@ retry: tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); ctx->cursor.outer.mc_flags |= C_GCFREEZE; - unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; + size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { - TRACE("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, + TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, left, fill_gc_id); if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || chunk - left > env->me_maxgc_ov1page) { @@ -9867,37 +9863,36 @@ retry: gcu_clean_reserved(env, data); if (unlikely(txn->tw.loose_count || - amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { - NOTICE("** restart: reclaimed-list growth (%u -> %u, loose +%u)", - amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + amount != MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist))) { + NOTICE("** restart: reclaimed-list growth (%zu -> %zu, loose +%zu)", + amount, MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), txn->tw.loose_count); goto retry; } if (unlikely(txn->tw.lifo_reclaimed ? 
ctx->cleaned_slot < - MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : ctx->cleaned_id < txn->tw.last_reclaimed)) { NOTICE("%s", "** restart: reclaimed-slots changed"); goto retry; } if (unlikely(ctx->retired_stored != - MDBX_PNL_SIZE(txn->tw.retired_pages))) { + MDBX_PNL_GETSIZE(txn->tw.retired_pages))) { tASSERT(txn, - ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); - NOTICE("** restart: retired-list growth (%u -> %u)", - ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); + ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + NOTICE("** restart: retired-list growth (%zu -> %zu)", + ctx->retired_stored, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); goto retry; } pgno_t *dst = data.iov_base; - *dst++ = chunk; + *dst++ = (pgno_t)chunk; pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; - TRACE("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO "] @%" PRIaTXN, - dbg_prefix_mode, chunk, (unsigned)(from - txn->tw.reclaimed_pglist), - from[0], (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], - fill_gc_id); + TRACE("%s: fill %zu [ %zu:%" PRIaPGNO "...%zu:%" PRIaPGNO "] @%" PRIaTXN, + dbg_prefix_mode, chunk, from - txn->tw.reclaimed_pglist, from[0], + to - txn->tw.reclaimed_pglist, to[-1], fill_gc_id); left -= chunk; if (AUDIT_ENABLED()) { @@ -9923,29 +9918,30 @@ retry: tASSERT(txn, rc == MDBX_SUCCESS); if (unlikely(txn->tw.loose_count != 0)) { - NOTICE("** restart: got %u loose pages", txn->tw.loose_count); + NOTICE("** restart: got %zu loose pages", txn->tw.loose_count); goto retry; } if (unlikely(ctx->filled_slot != (txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0))) { const bool will_retry = ctx->loop < 9; - NOTICE("** %s: reserve excess (filled-slot %u, loop %u)", + NOTICE("** %s: reserve excess (filled-slot %zu, loop %zu)", will_retry ? "restart" : "ignore", ctx->filled_slot, ctx->loop); if (will_retry) goto retry; } - tASSERT(txn, txn->tw.lifo_reclaimed == NULL || - ctx->cleaned_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + tASSERT(txn, + txn->tw.lifo_reclaimed == NULL || + ctx->cleaned_slot == MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); bailout: txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; - TRACE("<<< %u loops, rc = %d", ctx->loop, rc); + MDBX_PNL_SETSIZE(txn->tw.reclaimed_pglist, 0); + TRACE("<<< %zu loops, rc = %d", ctx->loop, rc); return rc; } @@ -9954,7 +9950,7 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { if (MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) dl = dpl_sort(txn); int rc = MDBX_SUCCESS; - unsigned r, w; + size_t r, w; for (w = 0, r = 1; r <= dl->length; ++r) { MDBX_page *dp = dl->items[r].ptr; if (dp->mp_flags & P_LOOSE) { @@ -10004,13 +10000,13 @@ int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } /* Merge child txn into parent */ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, - const unsigned parent_retired_len) { + const size_t parent_retired_len) { MDBX_dpl *const src = dpl_sort(txn); /* Remove refunded pages from parent's dirty list */ MDBX_dpl *const dst = dpl_sort(parent); if (MDBX_ENABLE_REFUND) { - unsigned n = dst->length; + size_t n = dst->length; while (n && dst->items[n].pgno >= parent->mt_next_pgno) { if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { unsigned npages = dpl_npages(dst, n); @@ -10031,12 +10027,12 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, dpl_sift(parent, reclaimed_list, false); /* Move retired pages from parent's dirty & spilled list to reclaimed */ - unsigned r, w, 
d, s, l; + size_t r, w, d, s, l; for (r = w = parent_retired_len; - ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) { + ++r <= MDBX_PNL_GETSIZE(parent->tw.retired_pages);) { const pgno_t pgno = parent->tw.retired_pages[r]; - const unsigned di = dpl_exist(parent, pgno); - const unsigned si = !di ? search_spilled(parent, pgno) : 0; + const size_t di = dpl_exist(parent, pgno); + const size_t si = !di ? search_spilled(parent, pgno) : 0; unsigned npages; const char *kind; if (di) { @@ -10059,7 +10055,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, /* Список retired страниц не сортирован, но для ускорения сортировки * дополняется в соответствии с MDBX_PNL_ASCENDING */ #if MDBX_PNL_ASCENDING - const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages); + const size_t len = MDBX_PNL_GETSIZE(parent->tw.retired_pages); while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) { ++r; if (++l == npages) @@ -10083,46 +10079,46 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, continue; } - DEBUG("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, kind, - pgno); + DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, + kind, pgno); int err = pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); ENSURE(txn->mt_env, err == MDBX_SUCCESS); } - MDBX_PNL_SIZE(parent->tw.retired_pages) = w; + MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); /* Filter-out parent spill list */ - if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { + if (parent->tw.spill_pages && MDBX_PNL_GETSIZE(parent->tw.spill_pages) > 0) { const MDBX_PNL sl = spill_purge(parent); - unsigned len = MDBX_PNL_SIZE(sl); + size_t len = MDBX_PNL_GETSIZE(sl); if (len) { /* Remove refunded pages from parent's spill list */ if (MDBX_ENABLE_REFUND && MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) { #if MDBX_PNL_ASCENDING - unsigned i = MDBX_PNL_SIZE(sl); + size_t i = MDBX_PNL_GETSIZE(sl); 
assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); do { if ((sl[i] & 1) == 0) DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); i -= 1; } while (i && sl[i] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_SIZE(sl) = i; + MDBX_PNL_SETSIZE(sl, i); #else assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); - unsigned i = 0; + size_t i = 0; do { ++i; if ((sl[i] & 1) == 0) DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_SIZE(sl) = len -= i; + MDBX_PNL_SETSIZE(sl, len -= i); memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); #endif } tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); /* Remove reclaimed pages from parent's spill list */ - s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); + s = MDBX_PNL_GETSIZE(sl), r = MDBX_PNL_GETSIZE(reclaimed_list); /* Scanning from end to begin */ while (s && r) { if (sl[s] & 1) { @@ -10146,10 +10142,10 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, /* Remove anything in our dirty list from parent's spill list */ /* Scanning spill list in descend order */ - const int step = MDBX_PNL_ASCENDING ? -1 : 1; - s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1; + const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; + s = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(sl) : 1; d = src->length; - while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_SIZE(sl))) { + while (d && (MDBX_PNL_ASCENDING ? 
s > 0 : s <= MDBX_PNL_GETSIZE(sl))) { if (sl[s] & 1) { s += step; continue; @@ -10249,7 +10245,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } ++w; } - NOTICE("squash to begin for extending-merge %u -> %u", d, w - 1); + NOTICE("squash to begin for extending-merge %zu -> %zu", d, w - 1); d = w - 1; continue; } @@ -10291,7 +10287,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } --w; } - NOTICE("squash to end for shrinking-merge %u -> %u", d, w + 1); + NOTICE("squash to end for shrinking-merge %zu -> %zu", d, w + 1); d = w + 1; continue; } @@ -10351,7 +10347,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, if (parent->tw.spill_pages) { assert(pnl_check_allocated(parent->tw.spill_pages, (size_t)parent->mt_next_pgno << 1)); - if (MDBX_PNL_SIZE(parent->tw.spill_pages)) + if (MDBX_PNL_GETSIZE(parent->tw.spill_pages)) parent->mt_flags |= MDBX_TXN_SPILLS; } } @@ -10423,8 +10419,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { sizeof(parent->mt_geo)) == 0); tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary, sizeof(parent->mt_canary)) == 0); - tASSERT(txn, - !txn->tw.spill_pages || MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); + tASSERT(txn, !txn->tw.spill_pages || + MDBX_PNL_GETSIZE(txn->tw.spill_pages) == 0); tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ @@ -10434,11 +10430,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* Preserve space for spill list to avoid parent's state corruption * if allocation fails. 
*/ - const unsigned parent_retired_len = - (unsigned)(uintptr_t)parent->tw.retired_pages; - tASSERT(txn, parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); - const unsigned retired_delta = - MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len; + const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages; + tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + const size_t retired_delta = + MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; if (retired_delta) { rc = pnl_need(&txn->tw.reclaimed_pglist, retired_delta); if (unlikely(rc != MDBX_SUCCESS)) @@ -10448,7 +10443,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { if (txn->tw.spill_pages) { if (parent->tw.spill_pages) { rc = pnl_need(&parent->tw.spill_pages, - MDBX_PNL_SIZE(txn->tw.spill_pages)); + MDBX_PNL_GETSIZE(txn->tw.spill_pages)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -10494,12 +10489,12 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* Update parent's DBs array */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); parent->mt_numdbs = txn->mt_numdbs; - for (unsigned i = 0; i < txn->mt_numdbs; i++) { + for (size_t i = 0; i < txn->mt_numdbs; i++) { /* preserve parent's status */ const uint8_t state = txn->mt_dbistate[i] | (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - DEBUG("db %u dbi-state %s 0x%02x -> 0x%02x", i, + DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", i, (parent->mt_dbistate[i] != state) ? 
"update" : "still", parent->mt_dbistate[i], state); parent->mt_dbistate[i] = state; @@ -10520,7 +10515,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && lp->mp_pgno + 1 < parent->mt_next_pgno); /* Check parent's reclaimed pages not suitable for refund */ - if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)) + if (MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist)) tASSERT(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < parent->mt_next_pgno); } @@ -10542,7 +10537,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { if (txn->tw.dirtylist->length == 0 && (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { - for (int i = txn->mt_numdbs; --i >= 0;) + for (intptr_t i = txn->mt_numdbs; --i >= 0;) tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); #if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT rc = txn_end(txn, end_mode); @@ -10607,7 +10602,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ts_2 = latency ? 
osal_monotime() : 0; if (AUDIT_ENABLED()) { - rc = audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); + rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); const uint64_t audit_end = osal_monotime(); audit_duration = osal_monotime_to_16dot16(audit_end - ts_2); ts_2 = audit_end; @@ -10650,7 +10645,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; unaligned_poke_u64(4, meta.mm_pages_retired, unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + - MDBX_PNL_SIZE(txn->tw.retired_pages)); + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); meta.mm_geo = txn->mt_geo; meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; @@ -10660,8 +10655,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { #if MDBX_ENABLE_BIGFOOT if (gcu_ctx.bigfoot > txn->mt_txnid) { commit_txnid = gcu_ctx.bigfoot; - TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, - (unsigned)(commit_txnid - txn->mt_txnid)); + TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, + (size_t)(commit_txnid - txn->mt_txnid)); } #endif meta_set_txnid(env, &meta, commit_txnid); @@ -11057,7 +11052,7 @@ __cold static int read_header(MDBX_env *env, MDBX_meta *dest, } __cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, - unsigned num) { + size_t num) { ENSURE(env, is_powerof2(env->me_psize)); ENSURE(env, env->me_psize >= MIN_PAGESIZE); ENSURE(env, env->me_psize <= MAX_PAGESIZE); @@ -11067,7 +11062,7 @@ __cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); memset(model, 0, env->me_psize); - model->mp_pgno = num; + model->mp_pgno = (pgno_t)num; model->mp_flags = P_META; MDBX_meta *const model_meta = page_meta(model); unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC); @@ -11505,7 +11500,7 @@ fail: } static void 
recalculate_merge_threshold(MDBX_env *env) { - const unsigned bytes = page_space(env); + const size_t bytes = page_space(env); env->me_merge_threshold = (uint16_t)(bytes - (bytes * env->me_options.merge_threshold_16dot16_percent >> @@ -12904,7 +12899,7 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { } __cold static int __must_check_result override_meta(MDBX_env *env, - unsigned target, + size_t target, txnid_t txnid, const MDBX_meta *shape) { int rc = alloc_page_buf(env); @@ -12917,7 +12912,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, eASSERT(env, coherency_check_meta(env, model, true)); if (shape) { if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { - ERROR("bailout overriding meta-%u since model failed " + ERROR("bailout overriding meta-%zu since model failed " "freedb/maindb %s-check for txnid #%" PRIaTXN, target, "pre", constmeta_txnid(shape)); return MDBX_PROBLEM; @@ -12941,7 +12936,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); if (unlikely(!coherency_check_meta(env, model, false))) { - ERROR("bailout overriding meta-%u since model failed " + ERROR("bailout overriding meta-%zu since model failed " "freedb/maindb %s-check for txnid #%" PRIaTXN, target, "post", txnid); return MDBX_PROBLEM; @@ -12949,7 +12944,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, } } unaligned_poke_u64(4, model->mm_sign, meta_sign(model)); - rc = validate_meta(env, model, page, target, nullptr); + rc = validate_meta(env, model, page, (pgno_t)target, nullptr); if (unlikely(MDBX_IS_ERROR(rc))) return MDBX_PROBLEM; @@ -13007,11 +13002,11 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { const MDBX_meta *target_meta = METAPAGE(env, target); txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(target_meta)); - for (unsigned n = 0; n < NUM_METAS; 
++n) { + for (size_t n = 0; n < NUM_METAS; ++n) { if (n == target) continue; MDBX_meta meta = *METAPAGE(env, target); - if (validate_meta(env, &meta, pgno2page(env, n), n, nullptr) != + if (validate_meta(env, &meta, pgno2page(env, n), (pgno_t)n, nullptr) != MDBX_SUCCESS) { int err = override_meta(env, n, 0, nullptr); if (unlikely(err != MDBX_SUCCESS)) @@ -13644,7 +13639,7 @@ __cold static int env_close(MDBX_env *env) { } if (env->me_dbxs) { - for (unsigned i = env->me_numdbs; --i >= CORE_DBS;) + for (size_t i = env->me_numdbs; --i >= CORE_DBS;) osal_free(env->me_dbxs[i].md_name.iov_base); osal_free(env->me_dbxs); env->me_dbxs = nullptr; @@ -13867,25 +13862,25 @@ static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, __hot static struct node_result node_search(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - const int nkeys = page_numkeys(mp); + const intptr_t nkeys = page_numkeys(mp); DKBUF_DEBUG; - DEBUG("searching %u keys in %s %spage %" PRIaPGNO, nkeys, + DEBUG("searching %zu keys in %s %spage %" PRIaPGNO, nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno); struct node_result ret; ret.exact = false; STATIC_ASSERT(P_BRANCH == 1); - int low = mp->mp_flags & P_BRANCH; - int high = nkeys - 1; + intptr_t low = mp->mp_flags & P_BRANCH; + intptr_t high = nkeys - 1; if (unlikely(high < low)) { mc->mc_ki[mc->mc_top] = 0; ret.node = NULL; return ret; } - int i; + intptr_t i; MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; MDBX_val nodekey; if (unlikely(IS_LEAF2(mp))) { @@ -13897,7 +13892,7 @@ __hot static struct node_result node_search(MDBX_cursor *mc, cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= (char *)nodekey.iov_base + nodekey.iov_len); int cr = cmp(key, &nodekey); - DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); + DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); if (cr > 0) /* Found entry is less than the key. 
*/ /* Skip to get the smallest entry larger than key. */ @@ -13933,9 +13928,9 @@ __hot static struct node_result node_search(MDBX_cursor *mc, (char *)nodekey.iov_base + nodekey.iov_len); int cr = cmp(key, &nodekey); if (IS_LEAF(mp)) - DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); + DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); else - DEBUG("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, + DEBUG("found branch index %zu [%s -> %" PRIaPGNO "], rc = %i", i, DKEY_DEBUG(&nodekey), node_pgno(node), cr); if (cr > 0) /* Found entry is less than the key. */ @@ -14024,7 +14019,8 @@ __hot static __always_inline int page_get_checker_lite(const uint16_t ILL, if (unlikely(page->mp_upper < page->mp_lower || ((page->mp_lower | page->mp_upper) & 1) || PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize)) - return bad_page(page, "invalid page' lower(%u)/upper(%u) with limit %u\n", + return bad_page(page, + "invalid page' lower(%u)/upper(%u) with limit %zu\n", page->mp_lower, page->mp_upper, page_space(txn->mt_env)); } else if ((ILL & P_OVERFLOW) == 0) { @@ -14085,8 +14081,8 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, search_spilled(spiller, pgno)) break; - const unsigned i = dpl_search(spiller, pgno); - tASSERT(txn, (int)i > 0); + const size_t i = dpl_search(spiller, pgno); + tASSERT(txn, (intptr_t)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; r.page = spiller->tw.dirtylist->items[i].ptr; @@ -14127,9 +14123,9 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, while (IS_BRANCH(mp)) { MDBX_node *node; - int i; + intptr_t i; - DEBUG("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, + DEBUG("branch page %" PRIaPGNO " has %zu keys", mp->mp_pgno, page_numkeys(mp)); /* Don't assert on branch pages in the GC. 
We can get here * while in the process of rebalancing a GC branch page; we must @@ -14156,7 +14152,7 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; else i = page_numkeys(mp) - 1; - DEBUG("following index %u for key [%s]", i, DKEY_DEBUG(key)); + DEBUG("following index %zu for key [%s]", i, DKEY_DEBUG(key)); } cASSERT(mc, i >= 0 && i < (int)page_numkeys(mp)); @@ -14208,7 +14204,7 @@ static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, ? 4 /* sizeof(uint32_t) */ : ((db->md_flags & MDBX_DUPFIXED) ? 1 : 0); dbx->md_vlen_max = valsize_max(pagesize, db->md_flags); - assert(dbx->md_vlen_max != (unsigned)-1); + assert(dbx->md_vlen_max != (size_t)-1); if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min || @@ -14222,10 +14218,10 @@ static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, return MDBX_SUCCESS; } -static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { +static int fetch_sdb(MDBX_txn *txn, size_t dbi) { MDBX_cursor_couple couple; if (unlikely(dbi_changed(txn, dbi))) { - NOTICE("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); + NOTICE("dbi %zu was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); return MDBX_BAD_DBI; } int rc = cursor_init(&couple.outer, txn, MAIN_DBI); @@ -14236,7 +14232,7 @@ static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { rc = page_search(&couple.outer, &dbx->md_name, 0); if (unlikely(rc != MDBX_SUCCESS)) { notfound: - NOTICE("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN + NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN " (err %d)", dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, txn->mt_txnid, rc); @@ -14250,7 +14246,7 @@ static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { goto notfound; } if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { - NOTICE("dbi %u refs to not a 
named subDB `%*s` for txn %" PRIaTXN " (%s)", + NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, txn->mt_txnid, "wrong flags"); return MDBX_INCOMPATIBLE; /* not a named DB */ @@ -14262,7 +14258,7 @@ static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { return rc; if (unlikely(data.iov_len != sizeof(MDBX_db))) { - NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, txn->mt_txnid, "wrong rec-size"); return MDBX_INCOMPATIBLE; /* not a named DB */ @@ -14273,7 +14269,7 @@ static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { * have dropped and recreated the DB with other flags. */ MDBX_db *const db = &txn->mt_dbs[dbi]; if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) { - NOTICE("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN + NOTICE("dbi %zu refs to the re-created subDB `%*s` for txn %" PRIaTXN " with different flags (present 0x%X != wanna 0x%X)", dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, txn->mt_txnid, db->md_flags & DB_PERSISTENT_FLAGS, md_flags); @@ -14635,9 +14631,9 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, goto skip; } - int ki = mc->mc_ki[mc->mc_top]; + intptr_t ki = mc->mc_ki[mc->mc_top]; mc->mc_ki[mc->mc_top] = (indx_t)++ki; - const int numkeys = page_numkeys(mp); + const intptr_t numkeys = page_numkeys(mp); if (unlikely(ki >= numkeys)) { DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); @@ -14652,7 +14648,7 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } skip: - DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", + DEBUG("==> cursor points to page %" PRIaPGNO " with %zu keys, key index %u", mp->mp_pgno, page_numkeys(mp), 
mc->mc_ki[mc->mc_top]); if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { @@ -14742,7 +14738,7 @@ static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } - DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", + DEBUG("==> cursor points to page %" PRIaPGNO " with %zu keys, key index %u", mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { @@ -14850,7 +14846,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { goto got_node; } if (cmp > 0) { - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); if (nkeys > 1) { if (IS_LEAF2(mp)) { nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); @@ -14895,7 +14891,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { } /* If any parents have right-sibs, search. * Otherwise, there's nothing further. 
*/ - unsigned i; + size_t i; for (i = 0; i < mc->mc_top; i++) if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) break; @@ -15182,7 +15178,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { cASSERT(mc, nkeys <= UINT16_MAX); if (mc->mc_flags & C_EOF) @@ -15408,14 +15404,14 @@ static int cursor_next_batch(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { - if ((unsigned)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp)) + if ((size_t)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } - int ki = mc->mc_ki[mc->mc_top]; + intptr_t ki = mc->mc_ki[mc->mc_top]; mc->mc_ki[mc->mc_top] = (indx_t)++ki; - const int numkeys = page_numkeys(mp); + const intptr_t numkeys = page_numkeys(mp); if (likely(ki >= numkeys)) { DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); @@ -15479,8 +15475,8 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; } - const unsigned nkeys = page_numkeys(mp); - unsigned i = mc->mc_ki[mc->mc_top], n = 0; + const size_t nkeys = page_numkeys(mp); + size_t i = mc->mc_ki[mc->mc_top], n = 0; if (unlikely(i >= nkeys)) { cASSERT(mc, op == MDBX_GET_CURRENT); cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); @@ -15888,7 +15884,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* there's only a key anyway, so this is a no-op */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { char *ptr; - unsigned ksize = mc->mc_db->md_xsize; + size_t ksize = mc->mc_db->md_xsize; if (unlikely(key->iov_len != ksize)) return MDBX_BAD_VALSIZE; ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); @@ -15897,7 +15893,7 @@ 
__hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* if overwriting slot 0 of leaf, need to * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned dtop = 1; + size_t dtop = 1; mc->mc_top--; /* slot 0 is always an empty key, find real slot */ while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { @@ -16012,7 +16008,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, * fp: old sub-page or a header faking it. * mp: new (sub-)page. offset: growth in page size. * xdata: node data with new page or DB. */ - unsigned i; + size_t i; size_t offset = 0; MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; @@ -16224,7 +16220,7 @@ new_sub:; if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; - const unsigned i = mc->mc_top; + const size_t i = mc->mc_top; MDBX_page *const mp = mc->mc_pg[i]; for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -16279,9 +16275,9 @@ new_sub:; /* Adjust other cursors pointing to mp */ MDBX_cursor *m2; MDBX_xcursor *mx = mc->mc_xcursor; - unsigned i = mc->mc_top; + size_t i = mc->mc_top; MDBX_page *mp = mc->mc_pg[i]; - const int nkeys = page_numkeys(mp); + const intptr_t nkeys = page_numkeys(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) @@ -16525,20 +16521,20 @@ static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { } __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, - unsigned indx, + size_t indx, const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - DEBUG("add to leaf2-%spage %" PRIaPGNO " index %i, " + DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, " " key size %" PRIuPTR " [%s]", IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, key ? 
key->iov_len : 0, DKEY_DEBUG(key)); cASSERT(mc, key); cASSERT(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); - const unsigned ksize = mc->mc_db->md_xsize; + const size_t ksize = mc->mc_db->md_xsize; cASSERT(mc, ksize == key->iov_len); - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); /* Just using these for counting */ const intptr_t lower = mp->mp_lower + sizeof(indx_t); @@ -16552,7 +16548,7 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, char *const ptr = page_leaf2key(mp, indx, ksize); cASSERT(mc, nkeys >= indx); - const unsigned diff = nkeys - indx; + const size_t diff = nkeys - indx; if (likely(diff > 0)) /* Move higher keys up one slot. */ memmove(ptr + ksize, ptr, diff * ksize); @@ -16561,12 +16557,12 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, +static int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, const MDBX_val *key, pgno_t pgno) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - DEBUG("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO + DEBUG("add to branch-%spage %" PRIaPGNO " index %zi, node-pgno %" PRIaPGNO " key size %" PRIuPTR " [%s]", IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, key ? key->iov_len : 0, DKEY_DEBUG(key)); @@ -16575,9 +16571,9 @@ static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, STATIC_ASSERT(NODESIZE % 2 == 0); /* Move higher pointers up one slot. */ - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); cASSERT(mc, nkeys >= indx); - for (unsigned i = nkeys; i > indx; --i) + for (size_t i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. 
*/ @@ -16604,14 +16600,13 @@ static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, return MDBX_SUCCESS; } -__hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, - unsigned indx, +__hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, const MDBX_val *key, MDBX_val *data, unsigned flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - DEBUG("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR + DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR " key size %" PRIuPTR " [%s]", IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0, key ? key->iov_len : 0, DKEY_DEBUG(key)); @@ -16656,9 +16651,9 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); /* Move higher pointers up one slot. */ - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); cASSERT(mc, nkeys >= indx); - for (unsigned i = nkeys; i > indx; --i) + for (size_t i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. */ @@ -16703,16 +16698,16 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, * part of a MDBX_DUPFIXED database. */ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - const unsigned hole = mc->mc_ki[mc->mc_top]; - const unsigned nkeys = page_numkeys(mp); + const size_t hole = mc->mc_ki[mc->mc_top]; + const size_t nkeys = page_numkeys(mp); - DEBUG("delete node %u on %s page %" PRIaPGNO, hole, + DEBUG("delete node %zu on %s page %" PRIaPGNO, hole, IS_LEAF(mp) ? 
"leaf" : "branch", mp->mp_pgno); cASSERT(mc, hole < nkeys); if (IS_LEAF2(mp)) { cASSERT(mc, ksize >= sizeof(indx_t)); - unsigned diff = nkeys - 1 - hole; + size_t diff = nkeys - 1 - hole; char *base = page_leaf2key(mp, hole, ksize); if (diff) memmove(base, base + ksize, diff * ksize); @@ -16732,7 +16727,7 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { hole_size = EVEN(hole_size); const indx_t hole_offset = mp->mp_ptrs[hole]; - unsigned r, w; + size_t r, w; for (r = w = 0; r < nkeys; r++) if (r != hole) mp->mp_ptrs[w++] = (mp->mp_ptrs[r] < hole_offset) @@ -16759,12 +16754,12 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. */ -static void node_shrink(MDBX_page *mp, unsigned indx) { +static void node_shrink(MDBX_page *mp, size_t indx) { MDBX_node *node; MDBX_page *sp, *xp; char *base; size_t nsize, delta, len, ptr; - int i; + intptr_t i; node = page_node(mp, indx); sp = (MDBX_page *)node_data(node); @@ -16975,13 +16970,13 @@ static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, return MDBX_SUCCESS; } -static __inline int couple_init(MDBX_cursor_couple *couple, const MDBX_dbi dbi, +static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, MDBX_txn *const txn, MDBX_db *const db, MDBX_dbx *const dbx, uint8_t *const dbstate) { couple->outer.mc_signature = MDBX_MC_LIVE; couple->outer.mc_next = NULL; couple->outer.mc_backup = NULL; - couple->outer.mc_dbi = dbi; + couple->outer.mc_dbi = (MDBX_dbi)dbi; couple->outer.mc_txn = txn; couple->outer.mc_db = db; couple->outer.mc_dbx = dbx; @@ -17021,7 +17016,7 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const MDBX_dbi dbi, } /* Initialize a cursor for a given transaction and database. 
*/ -static int cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { +static int cursor_init(MDBX_cursor *mc, MDBX_txn *txn, size_t dbi) { STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], @@ -17179,7 +17174,7 @@ again: dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; dest->mc_top = src->mc_top; dest->mc_snum = src->mc_snum; - for (unsigned i = 0; i < src->mc_snum; ++i) { + for (size_t i = 0; i < src->mc_snum; ++i) { dest->mc_ki[i] = src->mc_ki[i]; dest->mc_pg[i] = src->mc_pg[i]; } @@ -17292,7 +17287,7 @@ static int update_key(MDBX_cursor *mc, const MDBX_val *key) { char *base; size_t len; ptrdiff_t delta, ksize, oksize; - int ptr, i, nkeys, indx; + intptr_t ptr, i, nkeys, indx; DKBUF_DEBUG; cASSERT(mc, cursor_is_tracked(mc)); @@ -17304,8 +17299,8 @@ static int update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_val k2; k2.iov_base = node_key(node); k2.iov_len = node_ks(node); - DEBUG("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, ptr, - DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); + DEBUG("update key %zi (offset %zu) [%s] to [%s] on page %" PRIaPGNO, indx, + ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); #endif /* MDBX_DEBUG */ /* Sizes must be 2-byte aligned. 
*/ @@ -17379,7 +17374,7 @@ static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { key4move.iov_base = node_key(srcnode); if (csrc->mc_ki[csrc->mc_top] == 0) { - const unsigned snum = csrc->mc_snum; + const size_t snum = csrc->mc_snum; cASSERT(csrc, snum > 0); /* must find the lowest key below src */ rc = page_search_lowest(csrc); @@ -17411,7 +17406,7 @@ static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { } if (cdst->mc_ki[cdst->mc_top] == 0) { - const unsigned snum = cdst->mc_snum; + const size_t snum = cdst->mc_snum; cASSERT(csrc, snum > 0); MDBX_cursor mn; cursor_copy(cdst, &mn); @@ -17682,11 +17677,11 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ - const unsigned dst_nkeys = page_numkeys(pdst); - const unsigned src_nkeys = page_numkeys(psrc); + const size_t dst_nkeys = page_numkeys(pdst); + const size_t src_nkeys = page_numkeys(psrc); cASSERT(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); if (likely(src_nkeys)) { - unsigned j = dst_nkeys; + size_t j = dst_nkeys; if (unlikely(pagetype & P_LEAF2)) { /* Mark dst as dirty. 
*/ if (unlikely(rc = page_touch(cdst))) @@ -17694,7 +17689,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { key.iov_len = csrc->mc_db->md_xsize; key.iov_base = page_data(psrc); - unsigned i = 0; + size_t i = 0; do { rc = node_add_leaf2(cdst, j++, &key); if (unlikely(rc != MDBX_SUCCESS)) @@ -17738,7 +17733,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (unlikely(rc = page_touch(cdst))) return rc; - unsigned i = 0; + size_t i = 0; while (true) { if (pagetype & P_LEAF) { MDBX_val data; @@ -17761,7 +17756,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } pdst = cdst->mc_pg[cdst->mc_top]; - DEBUG("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", + DEBUG("dst page %" PRIaPGNO " now has %zu keys (%.1f%% filled)", pdst->mp_pgno, page_numkeys(pdst), page_fill(cdst->mc_txn->mt_env, pdst)); @@ -17789,7 +17784,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { /* Adjust other cursors pointing to mp */ MDBX_cursor *m2, *m3; const MDBX_dbi dbi = csrc->mc_dbi; - const unsigned top = csrc->mc_top; + const size_t top = csrc->mc_top; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; @@ -17914,7 +17909,7 @@ static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_flags = csrc->mc_flags; cdst->mc_checking = csrc->mc_checking; - for (unsigned i = 0; i < csrc->mc_snum; i++) { + for (size_t i = 0; i < csrc->mc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; cdst->mc_ki[i] = csrc->mc_ki[i]; } @@ -17948,30 +17943,30 @@ static int rebalance(MDBX_cursor *mc) { const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); STATIC_ASSERT(P_BRANCH == 1); - const unsigned minkeys = (pagetype & P_BRANCH) + 1; + const size_t minkeys = (pagetype & P_BRANCH) + 1; /* Pages emptier than this are candidates for merging. */ - unsigned room_threshold = likely(mc->mc_dbi != FREE_DBI) - ? 
mc->mc_txn->mt_env->me_merge_threshold - : mc->mc_txn->mt_env->me_merge_threshold_gc; + size_t room_threshold = likely(mc->mc_dbi != FREE_DBI) + ? mc->mc_txn->mt_env->me_merge_threshold + : mc->mc_txn->mt_env->me_merge_threshold_gc; const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; - const unsigned numkeys = page_numkeys(tp); - const unsigned room = page_room(tp); + const size_t numkeys = page_numkeys(tp); + const size_t room = page_room(tp); DEBUG("rebalancing %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", + " (has %zu keys, full %.1f%%, used %zu, room %zu bytes )", (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), room); if (unlikely(numkeys < minkeys)) { - DEBUG("page %" PRIaPGNO " must be merged due keys < %u threshold", + DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold", tp->mp_pgno, minkeys); } else if (unlikely(room > room_threshold)) { - DEBUG("page %" PRIaPGNO " should be merged due room %u > %u threshold", + DEBUG("page %" PRIaPGNO " should be merged due room %zu > %zu threshold", tp->mp_pgno, room, room_threshold); } else { - DEBUG("no need to rebalance page %" PRIaPGNO ", room %u < %u threshold", + DEBUG("no need to rebalance page %" PRIaPGNO ", room %zu < %zu threshold", tp->mp_pgno, room, room_threshold); cASSERT(mc, mc->mc_db->md_entries > 0); return MDBX_SUCCESS; @@ -17980,7 +17975,7 @@ static int rebalance(MDBX_cursor *mc) { int rc; if (mc->mc_snum < 2) { MDBX_page *const mp = mc->mc_pg[0]; - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); cASSERT(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); if (IS_SUBP(mp)) { DEBUG("%s", "Can't rebalance a subpage, ignoring"); @@ -18062,7 +18057,7 @@ static int rebalance(MDBX_cursor *mc) { /* The parent (branch page) must have at least 2 pointers, * otherwise the tree is invalid. 
*/ - const unsigned pre_top = mc->mc_top - 1; + const size_t pre_top = mc->mc_top - 1; cASSERT(mc, IS_BRANCH(mc->mc_pg[pre_top])); cASSERT(mc, !IS_SUBP(mc->mc_pg[0])); cASSERT(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); @@ -18094,14 +18089,14 @@ static int rebalance(MDBX_cursor *mc) { } cASSERT(mc, left || right); - const unsigned ki_top = mc->mc_ki[mc->mc_top]; - const unsigned ki_pre_top = mn.mc_ki[pre_top]; - const unsigned nkeys = page_numkeys(mn.mc_pg[mn.mc_top]); + const size_t ki_top = mc->mc_ki[mc->mc_top]; + const size_t ki_pre_top = mn.mc_ki[pre_top]; + const size_t nkeys = page_numkeys(mn.mc_pg[mn.mc_top]); - const unsigned left_room = left ? page_room(left) : 0; - const unsigned right_room = right ? page_room(right) : 0; - const unsigned left_nkeys = left ? page_numkeys(left) : 0; - const unsigned right_nkeys = right ? page_numkeys(right) : 0; + const size_t left_room = left ? page_room(left) : 0; + const size_t right_room = right ? page_room(right) : 0; + const size_t left_nkeys = left ? page_numkeys(left) : 0; + const size_t right_nkeys = right ? page_numkeys(right) : 0; retry: if (left_room > room_threshold && left_room >= right_room) { /* try merge with left */ @@ -18110,7 +18105,7 @@ retry: mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); mc->mc_ki[mc->mc_top] = 0; - const unsigned new_ki = ki_top + left_nkeys; + const size_t new_ki = ki_top + left_nkeys; mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; /* We want rebalance to find mn when doing fixups */ WITH_CURSOR_TRACKING(mn, rc = page_merge(mc, &mn)); @@ -18176,7 +18171,7 @@ retry: goto retry; } ERROR("Unable to merge/rebalance %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", + " (has %zu keys, full %.1f%%, used %zu, room %zu bytes )", (pagetype & P_LEAF) ? 
"leaf" : "branch", tp->mp_pgno, numkeys, page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), room); @@ -18259,24 +18254,24 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { if (unlikely(mp->mp_upper < mp->mp_lower || ((mp->mp_lower | mp->mp_upper) & 1) || PAGEHDRSZ + mp->mp_upper > env->me_psize)) - rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %u\n", + rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", mp->mp_lower, mp->mp_upper, page_space(env)); char *const end_of_page = (char *)mp + env->me_psize; - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); STATIC_ASSERT(P_BRANCH == 1); if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { if ((!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && (!(mc->mc_checking & CC_UPDATING) || !(IS_MODIFIABLE(mc->mc_txn, mp) || (mp->mp_flags & P_SUBP)))) rc = - bad_page(mp, "%s-page nkeys (%u) < %u\n", + bad_page(mp, "%s-page nkeys (%zu) < %u\n", IS_BRANCH(mp) ? 
"branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); } if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + nkeys * sizeof(MDBX_node) + nkeys - 1 > env->me_psize)) - rc = bad_page(mp, "invalid page upper (%u) for nkeys %u with limit %u\n", + rc = bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", mp->mp_upper, nkeys, page_space(env)); const size_t ksize_max = keysize_max(env->me_psize, 0); @@ -18291,7 +18286,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } MDBX_val here, prev = {0, 0}; - for (unsigned i = 0; i < nkeys; ++i) { + for (size_t i = 0; i < nkeys; ++i) { if (IS_LEAF2(mp)) { char *const key = page_leaf2key(mp, i, leaf2_ksize); if (unlikely(end_of_page < key + leaf2_ksize)) { @@ -18313,7 +18308,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { here.iov_len = leaf2_ksize; here.iov_base = key; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i, + rc = bad_page(mp, "leaf2-key #%zu wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here)); prev = here; } @@ -18321,16 +18316,16 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { const MDBX_node *const node = page_node(mp, i); const char *node_end = (char *)node + NODESIZE; if (unlikely(node_end > end_of_page)) { - rc = bad_page(mp, "node[%u] (%zu) beyond page-end\n", i, + rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i, node_end - end_of_page); continue; } const size_t ksize = node_ks(node); if (unlikely(ksize > ksize_max)) - rc = bad_page(mp, "node[%u] too long key (%zu)\n", i, ksize); + rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize); char *key = node_key(node); if (unlikely(end_of_page < key + ksize)) { - rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i, + rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i, key + ksize - end_of_page); continue; } @@ -18338,13 
+18333,13 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { if (unlikely(ksize < mc->mc_dbx->md_klen_min || ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( - mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n", + mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n", i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); if ((mc->mc_checking & CC_SKIPORD) == 0) { here.iov_base = key; here.iov_len = ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "node[%u] key wrong order (%s >= %s)\n", i, + rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here)); prev = here; } @@ -18352,23 +18347,24 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { if (IS_BRANCH(mp)) { if ((mc->mc_checking & CC_UPDATING) == 0 && i == 0 && unlikely(ksize != 0)) - rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n", + rc = bad_page(mp, "branch-node[%zu] wrong 0-node key-length (%zu)\n", i, ksize); const pgno_t ref = node_pgno(node); if (unlikely(ref < MIN_PAGENO) || (unlikely(ref >= mc->mc_txn->mt_next_pgno) && (unlikely(ref >= mc->mc_txn->mt_geo.now) || !(mc->mc_checking & CC_RETIRING)))) - rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); + rc = bad_page(mp, "branch-node[%zu] wrong pgno (%u)\n", i, ref); if (unlikely(node_flags(node))) - rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i, + rc = bad_page(mp, "branch-node[%zu] wrong flags (%u)\n", i, node_flags(node)); continue; } switch (node_flags(node)) { default: - rc = bad_page(mp, "invalid node[%u] flags (%u)\n", i, node_flags(node)); + rc = + bad_page(mp, "invalid node[%zu] flags (%u)\n", i, node_flags(node)); break; case F_BIGDATA /* data on large-page */: case 0 /* usual */: @@ -18383,7 +18379,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { if (node_flags(node) & F_BIGDATA) { if 
(unlikely(end_of_page < data + sizeof(pgno_t))) { rc = bad_page( - mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n", + mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); continue; } @@ -18419,9 +18415,9 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } if (unlikely(end_of_page < data + dsize)) { - rc = - bad_page(mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n", - "data", i, nkeys, dsize, data + dsize - end_of_page); + rc = bad_page(mp, + "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", + "data", i, nkeys, dsize, data + dsize - end_of_page); continue; } @@ -18469,7 +18465,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } const char *const end_of_subpage = data + dsize; - const int nsubkeys = page_numkeys(sp); + const intptr_t nsubkeys = page_numkeys(sp); if (unlikely(nsubkeys == 0) && !(mc->mc_checking & CC_UPDATING) && mc->mc_db->md_entries) rc = bad_page(mp, "no keys on a %s-page\n", @@ -18577,7 +18573,7 @@ __cold static int cursor_check(MDBX_cursor *mc) { for (int n = 0; n < (int)mc->mc_snum; ++n) { MDBX_page *mp = mc->mc_pg[n]; - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? true : false; const bool expect_nested_leaf = (n + 1 == mc->mc_db->md_depth - 1) ? 
true : false; @@ -18602,7 +18598,7 @@ __cold static int cursor_check(MDBX_cursor *mc) { if (unlikely(err != MDBX_SUCCESS)) return err; - for (unsigned i = 0; i < nkeys; ++i) { + for (size_t i = 0; i < nkeys; ++i) { if (branch) { MDBX_node *node = page_node(mp, i); cASSERT(mc, node_flags(node) == 0); @@ -18640,7 +18636,7 @@ static int cursor_del(MDBX_cursor *mc) { int rc; MDBX_page *mp; indx_t ki; - unsigned nkeys; + size_t nkeys; MDBX_dbi dbi = mc->mc_dbi; cASSERT(mc, cursor_is_tracked(mc)); @@ -18826,25 +18822,25 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, const unsigned naf) { unsigned flags; int rc = MDBX_SUCCESS, foliage = 0; - unsigned i, ptop; + size_t i, ptop; MDBX_env *const env = mc->mc_txn->mt_env; MDBX_val sepkey, rkey, xdata; MDBX_page *tmp_ki_copy = NULL; DKBUF; MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - const unsigned newindx = mc->mc_ki[mc->mc_top]; - unsigned nkeys = page_numkeys(mp); + const size_t newindx = mc->mc_ki[mc->mc_top]; + size_t nkeys = page_numkeys(mp); if (AUDIT_ENABLED()) { rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; } STATIC_ASSERT(P_BRANCH == 1); - const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1; + const size_t minkeys = (mp->mp_flags & P_BRANCH) + 1; DEBUG(">> splitting %s-page %" PRIaPGNO - " and adding %zu+%zu [%s] at %i, nkeys %i", + " and adding %zu+%zu [%s] at %i, nkeys %zi", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), mc->mc_ki[mc->mc_top], nkeys); @@ -18913,7 +18909,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mn.mc_ki[mn.mc_top] = 0; mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; - unsigned split_indx = + size_t split_indx = (newindx < nkeys) ? /* split at the middle */ (nkeys + 1) >> 1 : /* split at the end (i.e. 
like append-mode ) */ nkeys - minkeys + 1; @@ -18965,9 +18961,9 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } else { if (IS_LEAF2(sister)) { char *split, *ins; - unsigned lsize, rsize, ksize; + size_t lsize, rsize, ksize; /* Move half of the keys to the right sibling */ - const int distance = mc->mc_ki[mc->mc_top] - split_indx; + const intptr_t distance = mc->mc_ki[mc->mc_top] - split_indx; ksize = mc->mc_db->md_xsize; split = page_leaf2key(mp, split_indx, ksize); rsize = (nkeys - split_indx) * ksize; @@ -19022,7 +19018,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, goto done; } - const unsigned max_space = page_space(env); + const size_t max_space = page_space(env); const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata) : branch_size(env, newkey); @@ -19058,18 +19054,18 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, split_indx += mp->mp_flags & P_BRANCH; } eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); - const unsigned dim_nodes = + const size_t dim_nodes = (newindx >= split_indx) ? split_indx : nkeys - split_indx; - const unsigned dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; + const size_t dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; if (new_size >= dim_used) { /* Search for best acceptable split point */ i = (newindx < split_indx) ? 0 : nkeys; - int dir = (newindx < split_indx) ? 1 : -1; + intptr_t dir = (newindx < split_indx) ? 
1 : -1; size_t before = 0, after = new_size + page_used(env, mp); - unsigned best_split = split_indx; - unsigned best_shift = INT_MAX; + size_t best_split = split_indx; + size_t best_shift = INT_MAX; - TRACE("seek separator from %u, step %i, default %u, new-idx %u, " + TRACE("seek separator from %zu, step %zi, default %zu, new-idx %zu, " "new-size %zu", i, dir, split_indx, newindx, new_size); do { @@ -19087,13 +19083,13 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, before += size; after -= size; - TRACE("step %u, size %zu, before %zu, after %zu, max %u", i, size, + TRACE("step %zu, size %zu, before %zu, after %zu, max %zu", i, size, before, after, max_space); if (before <= max_space && after <= max_space) { - const unsigned split = i + (dir > 0); + const size_t split = i + (dir > 0); if (split >= minkeys && split <= nkeys + 1 - minkeys) { - const unsigned shift = branchless_abs(split_indx - split); + const size_t shift = branchless_abs(split_indx - split); if (shift >= best_shift) break; best_shift = shift; @@ -19106,7 +19102,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } while (i < nkeys); split_indx = best_split; - TRACE("chosen %u", split_indx); + TRACE("chosen %zu", split_indx); } eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); @@ -19120,7 +19116,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } } - DEBUG("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey)); + DEBUG("separator is %zd [%s]", split_indx, DKEY_DEBUG(&sepkey)); bool did_split_parent = false; /* Copy separator key to the parent. 
*/ @@ -19256,9 +19252,9 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, /* Move nodes */ mc->mc_pg[mc->mc_top] = sister; i = split_indx; - unsigned n = 0; + size_t n = 0; do { - TRACE("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, sister->mp_pgno); + TRACE("i %zu, nkeys %zu => n %zu, rp #%u", i, nkeys, n, sister->mp_pgno); pgno_t pgno = 0; MDBX_val *rdata = NULL; if (i == newindx) { @@ -19315,7 +19311,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } while (i != split_indx); - TRACE("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, + TRACE("i %zu, nkeys %zu, n %zu, pgno #%u", i, nkeys, n, mc->mc_pg[mc->mc_top]->mp_pgno); nkeys = page_numkeys(tmp_ki_copy); @@ -19398,7 +19394,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (XCURSOR_INITED(m3) && IS_LEAF(mp)) XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } - TRACE("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), + TRACE("mp #%u left: %zd, sister #%u left: %zd", mp->mp_pgno, page_room(mp), sister->mp_pgno, page_room(sister)); done: @@ -19566,7 +19562,7 @@ static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, size_t bytes, pgno_t pgno, pgno_t npages) { assert(pgno == 0 || bytes > PAGEHDRSZ); while (bytes > 0) { - const unsigned side = ctx->mc_head & 1; + const size_t side = ctx->mc_head & 1; const size_t left = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; if (left < (pgno ? 
PAGEHDRSZ : 1)) { int err = compacting_toggle_write_buffers(ctx); @@ -19645,7 +19641,7 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, return MDBX_ENOMEM; char *ptr = buf; - for (unsigned i = 0; i < mc->mc_top; i++) { + for (size_t i = 0; i < mc->mc_top; i++) { page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); mc->mc_pg[i] = (MDBX_page *)ptr; ptr += ctx->mc_env->me_psize; @@ -19655,12 +19651,12 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, while (mc->mc_snum > 0) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - unsigned n = page_numkeys(mp); + size_t n = page_numkeys(mp); if (IS_LEAF(mp)) { if (!(mc->mc_flags & C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) { - for (unsigned i = 0; i < n; i++) { + for (size_t i = 0; i < n; i++) { MDBX_node *node = page_node(mp, i); if (node_flags(node) == F_BIGDATA) { /* Need writable leaf */ @@ -19884,7 +19880,7 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, data.iov_len < MDBX_PNL_SIZEOF(pnl) || !(pnl_check(pnl, read_txn->mt_next_pgno)))) return MDBX_CORRUPTED; - gc += MDBX_PNL_SIZE(pnl); + gc += MDBX_PNL_GETSIZE(pnl); } if (unlikely(rc != MDBX_NOTFOUND)) return rc; @@ -20423,7 +20419,7 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (err == MDBX_SUCCESS) { const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (unsigned i = 0; i < page_numkeys(mp); i++) { + for (size_t i = 0; i < page_numkeys(mp); i++) { const MDBX_node *node = page_node(mp, i); if (node_flags(node) != F_SUBDATA) continue; @@ -20690,7 +20686,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = arg->mi_recent_txnid; if (env->me_lck_mmap.lck) { - for (unsigned i = 0; i < arg->mi_numreaders; ++i) { + for (size_t i = 0; i < arg->mi_numreaders; ++i) { const uint32_t pid = 
atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid) { @@ -21090,11 +21086,11 @@ static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { osal_free(ptr); if (env->me_numdbs == dbi + 1) { - unsigned i = env->me_numdbs; + size_t i = env->me_numdbs; do --i; while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); - env->me_numdbs = i; + env->me_numdbs = (MDBX_dbi)i; } return MDBX_SUCCESS; @@ -21165,10 +21161,10 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { cursor_copy(mc, &mx); while (mc->mc_snum > 0) { MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); if (IS_LEAF(mp)) { cASSERT(mc, mc->mc_snum == mc->mc_db->md_depth); - for (unsigned i = 0; i < nkeys; i++) { + for (size_t i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); if (node_flags(node) & F_BIGDATA) { rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); @@ -21195,7 +21191,7 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { const unsigned pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? 
P_LEAF : P_BRANCH); - for (unsigned i = 0; i < nkeys; i++) { + for (size_t i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); tASSERT(txn, (node_flags(node) & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); @@ -21219,7 +21215,7 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { pop: cursor_pop(mc); mc->mc_ki[0] = 0; - for (unsigned i = 1; i < mc->mc_snum; i++) { + for (size_t i = 1; i < mc->mc_snum; i++) { mc->mc_ki[i] = 0; mc->mc_pg[i] = mx.mc_pg[i]; } @@ -21327,9 +21323,9 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, int serial = 0; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck)) { - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; i++) { + for (size_t i = 0; i < snap_nreaders; i++) { const MDBX_reader *r = lck->mti_readers + i; retry_reader:; const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease); @@ -21377,8 +21373,8 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, reader_pages_retired)) : 0; } - rc = func(ctx, ++serial, i, pid, (mdbx_tid_t)tid, txnid, lag, bytes_used, - bytes_retained); + rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)tid, txnid, lag, + bytes_used, bytes_retained); if (unlikely(rc != MDBX_SUCCESS)) break; } @@ -21391,13 +21387,13 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, * return -1 if already present. 
*/ __cold static bool pid_insert(uint32_t *ids, uint32_t pid) { /* binary search of pid in list */ - unsigned base = 0; - unsigned cursor = 1; + size_t base = 0; + size_t cursor = 1; int val = 0; - unsigned n = ids[0]; + size_t n = ids[0]; while (n > 0) { - unsigned pivot = n >> 1; + size_t pivot = n >> 1; cursor = base + pivot + 1; val = pid - ids[cursor]; @@ -21447,7 +21443,7 @@ __cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, return MDBX_SUCCESS; } - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); uint32_t pidsbuf_onstask[142]; uint32_t *const pids = @@ -21459,7 +21455,7 @@ __cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, pids[0] = 0; int count = 0; - for (unsigned i = 0; i < snap_nreaders; i++) { + for (size_t i = 0; i < snap_nreaders; i++) { const uint32_t pid = atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid == 0) @@ -21508,7 +21504,7 @@ __cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, } /* clean it */ - for (unsigned j = i; j < snap_nreaders; j++) { + for (size_t j = i; j < snap_nreaders; j++) { if (lck->mti_readers[j].mr_pid.weak == pid) { DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, lck->mti_readers[j].mr_txnid.weak); @@ -21584,7 +21580,7 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, MDBX_reader *stucked = nullptr; uint64_t hold_retired = 0; - for (unsigned i = 0; i < lck->mti_numreaders.weak; ++i) { + for (size_t i = 0; i < lck->mti_numreaders.weak; ++i) { const uint64_t snap_retired = atomic_load64( &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); @@ -21741,7 +21737,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); MDBX_page_type_t type = walk_page_type(mp); - const unsigned nentries = mp ? 
page_numkeys(mp) : 0; + const size_t nentries = mp ? page_numkeys(mp) : 0; unsigned npages = 1; size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); size_t header_size = @@ -21751,7 +21747,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, (mp ? page_room(mp) : pagesize - header_size) - payload_size; size_t align_bytes = 0; - for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; align_bytes += ((payload_size + align_bytes) & 1), ++i) { if (type == MDBX_page_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ @@ -21822,7 +21818,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } MDBX_page *sp = node_data(node); - const unsigned nsubkeys = page_numkeys(sp); + const size_t nsubkeys = page_numkeys(sp); size_t subheader_size = IS_LEAF2(sp) ? PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; size_t subunused_size = page_room(sp); @@ -21843,7 +21839,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, err = MDBX_CORRUPTED; } - for (unsigned j = 0; err == MDBX_SUCCESS && j < nsubkeys; + for (size_t j = 0; err == MDBX_SUCCESS && j < nsubkeys; subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) { if (subtype == MDBX_subpage_dupfixed_leaf) { @@ -21884,7 +21880,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; - for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { if (type == MDBX_page_dupfixed_leaf) continue; @@ -22050,7 +22046,7 @@ int mdbx_cursor_on_first(const MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED)) return mc->mc_db->md_entries ? 
MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; - for (unsigned i = 0; i < mc->mc_snum; ++i) { + for (size_t i = 0; i < mc->mc_snum; ++i) { if (mc->mc_ki[i]) return MDBX_RESULT_FALSE; } @@ -22069,8 +22065,8 @@ int mdbx_cursor_on_last(const MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED)) return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; - for (unsigned i = 0; i < mc->mc_snum; ++i) { - unsigned nkeys = page_numkeys(mc->mc_pg[i]); + for (size_t i = 0; i < mc->mc_snum; ++i) { + size_t nkeys = page_numkeys(mc->mc_pg[i]); if (mc->mc_ki[i] < nkeys - 1) return MDBX_RESULT_FALSE; } @@ -22097,8 +22093,8 @@ int mdbx_cursor_eof(const MDBX_cursor *mc) { struct diff_result { ptrdiff_t diff; - unsigned level; - int root_nkeys; + size_t level; + ptrdiff_t root_nkeys; }; /* calculates: r = x - y */ @@ -22132,18 +22128,18 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { - ERROR("Mismatch cursors's pages at %u level", r->level); + ERROR("Mismatch cursors's pages at %zu level", r->level); return MDBX_PROBLEM; } - int nkeys = page_numkeys(y->mc_pg[r->level]); + intptr_t nkeys = page_numkeys(y->mc_pg[r->level]); assert(nkeys > 0); if (r->level == 0) r->root_nkeys = nkeys; - const int limit_ki = nkeys - 1; - const int x_ki = x->mc_ki[r->level]; - const int y_ki = y->mc_ki[r->level]; + const intptr_t limit_ki = nkeys - 1; + const intptr_t x_ki = x->mc_ki[r->level]; + const intptr_t y_ki = y->mc_ki[r->level]; r->diff = ((x_ki < limit_ki) ? x_ki : limit_ki) - ((y_ki < limit_ki) ? 
y_ki : limit_ki); if (r->diff == 0) { @@ -23312,7 +23308,7 @@ __cold void global_ctor(void) { bootid = osal_bootid(); #if MDBX_DEBUG - for (unsigned i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { + for (size_t i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { const bool s0 = (i >> 0) & 1; const bool s1 = (i >> 1) & 1; const bool s2 = (i >> 2) & 1; @@ -23359,9 +23355,9 @@ __cold void global_ctor(void) { #endif /* MDBX_DEBUG*/ #if 0 /* debug */ - for (unsigned i = 0; i < 65536; ++i) { + for (size_t i = 0; i < 65536; ++i) { size_t pages = pv2pages(i); - unsigned x = pages2pv(pages); + size_t x = pages2pv(pages); size_t xp = pv2pages(x); if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) printf("%u => %zu => %u => %zu\n", i, pages, x, xp); diff --git a/src/internals.h b/src/internals.h index 05f7393f..061e5cba 100644 --- a/src/internals.h +++ b/src/internals.h @@ -570,7 +570,7 @@ typedef struct MDBX_page { : PAGETYPE_WHOLE(p)) /* Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) +#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) #pragma pack(pop) @@ -860,7 +860,7 @@ typedef struct MDBX_dp { MDBX_page *ptr; pgno_t pgno; union { - unsigned extra; + uint32_t extra; __anonymous_struct_extension__ struct { unsigned multi : 1; unsigned lru : 31; @@ -870,10 +870,10 @@ typedef struct MDBX_dp { /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ typedef struct MDBX_dpl { - unsigned sorted; - unsigned length; - unsigned pages_including_loose; /* number of pages, but not an entries. */ - unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ + size_t sorted; + size_t length; + size_t pages_including_loose; /* number of pages, but not an entries. 
*/ + size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) MDBX_dp items[] /* dynamic size with holes at zero and after the last */; @@ -892,11 +892,17 @@ typedef struct MDBX_dpl { ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) -#define MDBX_PNL_SIZE(pl) ((pl)[0]) +#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) +#define MDBX_PNL_SETSIZE(pl, size) \ + do { \ + const size_t __size = size; \ + assert(__size < INT_MAX); \ + (pl)[0] = (pgno_t)__size; \ + } while (0) #define MDBX_PNL_FIRST(pl) ((pl)[1]) -#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)]) +#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)]) #define MDBX_PNL_BEGIN(pl) (&(pl)[1]) -#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1]) +#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) @@ -906,8 +912,8 @@ typedef struct MDBX_dpl { #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif -#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t)) -#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0) +#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t)) +#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0) /*----------------------------------------------------------------------------*/ /* Internal structures */ @@ -1013,13 +1019,13 @@ struct MDBX_txn { #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ + /* a sequence to spilling dirty page with LRU policy */ + unsigned dirtylru; /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. 
*/ - unsigned dirtyroom; - /* a sequence to spilling dirty page with LRU policy */ - unsigned dirtylru; + size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ @@ -1030,8 +1036,8 @@ struct MDBX_txn { * in this transaction, linked through `mp_next`. */ MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ - unsigned loose_count; - unsigned spill_least_removed; + size_t loose_count; + size_t spill_least_removed; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ diff --git a/src/osal.c b/src/osal.c index 34aeb62d..f8600cb8 100644 --- a/src/osal.c +++ b/src/osal.c @@ -583,7 +583,7 @@ static size_t osal_iov_max; MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, #if defined(_WIN32) || defined(_WIN64) - unsigned flags, + uint8_t flags, #endif /* Windows */ mdbx_filehandle_t fd) { memset(ior, 0, sizeof(osal_ioring_t)); @@ -1480,14 +1480,14 @@ MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, } } -int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int sgvcnt, +int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, size_t sgvcnt, uint64_t offset) { size_t expected = 0; - for (int i = 0; i < sgvcnt; ++i) + for (size_t i = 0; i < sgvcnt; ++i) expected += iov[i].iov_len; #if !MDBX_HAVE_PWRITEV size_t written = 0; - for (int i = 0; i < sgvcnt; ++i) { + for (size_t i = 0; i < sgvcnt; ++i) { int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); if (unlikely(rc != MDBX_SUCCESS)) return rc; diff --git a/src/osal.h b/src/osal.h index 568ae9c0..7d4b37b3 100644 --- a/src/osal.h +++ b/src/osal.h @@ -352,7 +352,7 @@ typedef struct osal_ioring { /* Actually this is not ioring for now, but on the way. 
*/ MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, #if defined(_WIN32) || defined(_WIN64) - unsigned flags, + uint8_t flags, #endif /* Windows */ mdbx_filehandle_t fd); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); @@ -380,11 +380,11 @@ static inline unsigned osal_ioring_used(const osal_ioring_t *ior) { return ior->allocated - ior->slots_left; } -static inline int osal_ioring_reserve(osal_ioring_t *ior, unsigned items, +static inline int osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const unsigned npages = (unsigned)(bytes >> ior->pagesize_ln2); + const size_t npages = bytes >> ior->pagesize_ln2; items = (items > npages) ? items : npages; #else (void)bytes; @@ -468,7 +468,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, - int sgvcnt, uint64_t offset); + size_t sgvcnt, uint64_t offset); MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, From 66f2e3d596e079d46d523c658055acb58ecd2a10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 30 Sep 2022 14:06:55 +0300 Subject: [PATCH 125/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=20API=20=D1=84=D1=83?= =?UTF-8?q?=D0=BD=D0=BA=D1=86=D0=B8=D0=B9=20`mdbx=5Flimits=5Fpairsize4page?= =?UTF-8?q?=5Fmax()`=20=D0=B8=20`mdbx=5Flimits=5Fvalsize4page=5Fmax()`=20?= =?UTF-8?q?=D1=81=20=D1=81=D0=BE=D0=BF=D1=83=D1=82=D1=81=D1=82=D0=B2=D1=83?= =?UTF-8?q?=D1=8E=D1=89=D0=B8=D0=BC=D0=B8=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1?= 
=?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B0=D0=BC=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 15 +++++++++++++++ src/core.c | 39 ++++++++++++++++++++++++++++++++++++--- src/internals.h | 7 ++++--- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/mdbx.h b/mdbx.h index 7ab171a1..b8bca567 100644 --- a/mdbx.h +++ b/mdbx.h @@ -3119,6 +3119,21 @@ mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags); MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags); +/** \brief Returns maximal size of key-value pair to fit in a single page with + * the given size and database flags, or -1 if pagesize is invalid. + * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags); + +/** \brief Returns maximal data size in bytes to fit in a leaf-page or + * single overflow/large-page with the given page size and database flags, + * or -1 if pagesize is invalid. + * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_valsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags); + /** \brief Returns maximal write transaction size (i.e. limit for summary volume * of dirty pages) in bytes for given page size, or -1 if pagesize is invalid. * \ingroup c_statinfo */ diff --git a/src/core.c b/src/core.c index 2dab0edb..90ae40b2 100644 --- a/src/core.c +++ b/src/core.c @@ -416,7 +416,7 @@ node_largedata_pgno(const MDBX_node *const __restrict node) { * and so on up to the root. 
Therefore double-splitting is avoided here and * the maximum node size is half of a leaf page space: * LEAF_NODE_MAX = even_floor(PAGEROOM / 2 - sizeof(indx_t)); - * DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - KEYLEN_MAX; + * DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - NODESIZE - KEYLEN_MAX; * * - SubDatabase-node must fit into one leaf-page: * SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(MDBX_db); @@ -530,6 +530,38 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, return valsize_max(pagesize, flags); } +__cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return BRANCH_NODE_MAX(pagesize) - NODESIZE; + + return LEAF_NODE_MAX(pagesize) - NODESIZE; +} + +intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return valsize_max(pagesize, flags); + + return PAGEROOM(pagesize); +} + /* Calculate the size of a leaf node. * * The size depends on the environment's page size; if a data item @@ -565,10 +597,10 @@ branch_size(const MDBX_env *env, const MDBX_val *key) { /* Size of a node in a branch page with a given key. * This is just the node header plus the key, there is no data. 
*/ size_t node_bytes = node_size(key, nullptr); - if (unlikely(node_bytes > env->me_leaf_nodemax)) { + if (unlikely(node_bytes > env->me_branch_nodemax)) { /* put on large/overflow page */ /* not implemented */ - mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__, + mdbx_assert_fail(env, "node_size(key) <= branch_nodemax", __func__, __LINE__); node_bytes = node_size(key, nullptr) + sizeof(pgno_t); } @@ -11544,6 +11576,7 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { leaf_nodemax >= branch_nodemax && leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); env->me_leaf_nodemax = (unsigned)leaf_nodemax; + env->me_branch_nodemax = (unsigned)branch_nodemax; env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); eASSERT(env, pgno2bytes(env, 1) == pagesize); eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); diff --git a/src/internals.h b/src/internals.h index 061e5cba..859a41c0 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1162,9 +1162,10 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - uint8_t me_psize2log; /* log2 of DB page size */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ + unsigned me_leaf_nodemax; /* max size of a leaf-node */ + unsigned me_branch_nodemax; /* max size of a branch-node */ + uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for From 6eeb08de46081787b11b6edd8bc6e5952555c52f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 30 Sep 2022 17:13:07 +0300 Subject: [PATCH 126/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB?= 
=?UTF-8?q?=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20`mdbx=5Fpani?= =?UTF-8?q?c()`=20=D0=B2=D0=BC=D0=B5=D1=81=D1=82=D0=BE=20`=5F=5Fassert=5Ff?= =?UTF-8?q?ail()`=20=D0=B2=20=D1=80=D1=8F=D0=B4=D0=B5=20=D0=B2=D0=BD=D1=83?= =?UTF-8?q?=D1=82=D1=80=D0=B5=D0=BD=D0=BD=D0=B8=D1=85=20=D0=BF=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B5=D1=80=D0=BE=D0=BA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/core.c b/src/core.c index 90ae40b2..008dae75 100644 --- a/src/core.c +++ b/src/core.c @@ -600,8 +600,8 @@ branch_size(const MDBX_env *env, const MDBX_val *key) { if (unlikely(node_bytes > env->me_branch_nodemax)) { /* put on large/overflow page */ /* not implemented */ - mdbx_assert_fail(env, "node_size(key) <= branch_nodemax", __func__, - __LINE__); + mdbx_panic("node_size(key) %zu > %u branch_nodemax", node_bytes, + env->me_branch_nodemax); node_bytes = node_size(key, nullptr) + sizeof(pgno_t); } @@ -13803,8 +13803,7 @@ __hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { return CMP2INT(unaligned_peek_u64(4, a->iov_base), unaligned_peek_u64(4, b->iov_base)); default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, - __LINE__); + mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); return 0; } } @@ -13820,8 +13819,7 @@ __hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { return CMP2INT(unaligned_peek_u64(2, a->iov_base), unaligned_peek_u64(2, b->iov_base)); default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, - __LINE__); + mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); return 0; } } @@ -13839,8 +13837,7 @@ __hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { return CMP2INT(unaligned_peek_u64(1, a->iov_base), unaligned_peek_u64(1, b->iov_base)); default: - mdbx_assert_fail(NULL, 
"invalid size for INTEGERKEY/INTEGERDUP", __func__, - __LINE__); + mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); return 0; } } From c17617b8164e9907ac51e0073b9e1a71eca72a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 30 Sep 2022 19:40:18 +0300 Subject: [PATCH 127/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BB=D0=B5=D0=B3?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=BD=D0=B0=D1=8F=20`assert=5Ffail()`=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D0=BD=D0=B5-=D0=BE=D1=82=D0=BB=D0=B0?= =?UTF-8?q?=D0=B4=D0=BE=D1=87=D0=BD=D1=8B=D1=85=20=D1=81=D0=B1=D0=BE=D1=80?= =?UTF-8?q?=D0=BE=D0=BA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/internals.h | 14 +++++++++++++- src/osal.c | 17 ++++++++++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/internals.h b/src/internals.h index 859a41c0..8af9dde2 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1368,10 +1368,22 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, #define FATAL(fmt, ...) 
\ debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + #define ENSURE_MSG(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ - mdbx_assert_fail(env, msg, __func__, __LINE__); \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ } while (0) #define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) diff --git a/src/osal.c b/src/osal.c index f8600cb8..6a2697fa 100644 --- a/src/osal.c +++ b/src/osal.c @@ -224,12 +224,15 @@ __extern_C void __assert(const char *function, const char *file, int line, __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, unsigned line) { #if MDBX_DEBUG - if (env && env->me_assert_func) { + if (env && env->me_assert_func) env->me_assert_func(env, msg, func, line); - return; - } #else (void)env; + assert_fail(msg, func, line); +} + +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line) { #endif /* MDBX_DEBUG */ if (debug_logger) @@ -266,8 +269,12 @@ __cold void mdbx_panic(const char *fmt, ...) { const int num = osal_vasprintf(&message, fmt, ap); va_end(ap); const char *const const_message = - (num < 1 || !message) ? "" - : message; + unlikely(num < 1 || !message) + ? 
"" + : message; + + if (debug_logger) + debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message); while (1) { #if defined(_WIN32) || defined(_WIN64) From a95ee8daa30dbc5bb2915f427043c51754e11d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 1 Oct 2022 01:54:09 +0300 Subject: [PATCH 128/364] =?UTF-8?q?mdbx:=20=D0=BC=D0=B8=D0=BD=D0=BE=D1=80?= =?UTF-8?q?=D0=BD=D0=B0=D1=8F=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1=D0=BE=D1=82?= =?UTF-8?q?=D0=BA=D0=B0=20`mdbx=5Fenv=5Fcreate()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index 008dae75..0a9b8285 100644 --- a/src/core.c +++ b/src/core.c @@ -11669,12 +11669,11 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_options.dp_loose_limit = 64; env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; - int rc; env->me_os_psize = (unsigned)os_psize; setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize : MAX_PAGESIZE); - rc = osal_fastmutex_init(&env->me_dbi_lock); + int rc = osal_fastmutex_init(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; From 559f3005caf79237e5a43f029fd12dec243fae1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 6 Oct 2022 13:52:05 +0300 Subject: [PATCH 129/364] =?UTF-8?q?mdbx-test:=20=D1=87=D1=83=D1=82=D1=8C?= =?UTF-8?q?=20=D0=B1=D0=BE=D0=BB=D1=8C=D1=88=D0=B5=20=D0=BB=D0=BE=D0=B3?= =?UTF-8?q?=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20=D0=BE=D1=88?= =?UTF-8?q?=D0=B8=D0=B1=D0=BE=D0=BA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/main.cc | 7 +++++-- test/osal-windows.cc | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/main.cc b/test/main.cc index 88d47799..e5854276 100644 --- a/test/main.cc +++ b/test/main.cc @@ -681,9 +681,9 @@ int main(int argc, char *const argv[]) { if (!actor) continue; - log_verbose("actor #%u, id %d, pid %ld: %s\n", actor->actor_id, - actor->space_id, (long)pid, status2str(status)); if (status > as_running) { + log_notice("actor #%u, id %d, pid %ld: %s\n", actor->actor_id, + actor->space_id, (long)pid, status2str(status)); left -= 1; if (status != as_successful) { if (global::config::failfast && !failed) { @@ -693,6 +693,9 @@ int main(int argc, char *const argv[]) { } failed = true; } + } else { + log_verbose("actor #%u, id %d, pid %ld: %s\n", actor->actor_id, + actor->space_id, (long)pid, status2str(status)); } } else { if (timeout_seconds_left == 0) diff --git a/test/osal-windows.cc b/test/osal-windows.cc index c90e4c05..7b3b4437 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -360,6 +360,7 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { status = as_coredump; break; default: + log_error("pid %u, ExitCode", pid, ExitCode); status = as_failed; break; } From 
24d7a4d6058db365c44a9aec822b372836d5aa96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 6 Oct 2022 20:00:50 +0300 Subject: [PATCH 130/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=BE=20=D0=BE=D0=BF=D0=B8=D1=81=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB=D1=8C=D0=B7=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20=D1=84=D0=B0=D0=B9=D0=BB=D0=BE?= =?UTF-8?q?=D0=B2=D1=8B=D1=85=20=D0=B4=D0=B5=D1=81=D0=BA=D1=80=D0=B8=D0=BF?= =?UTF-8?q?=D1=82=D0=BE=D1=80=D0=BE=D0=B2=20=D0=B2=20=D1=80=D0=B0=D0=B7?= =?UTF-8?q?=D0=BB=D0=B8=D1=87=D0=BD=D1=8B=D1=85=20=D1=80=D0=B5=D0=B6=D0=B8?= =?UTF-8?q?=D0=BC=D0=B0=D1=85.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/src/core.c b/src/core.c index 0a9b8285..562ce62e 100644 --- a/src/core.c +++ b/src/core.c @@ -13376,6 +13376,93 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; + /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: + * + * 0) Если размер страниц БД меньше системной страницы ОЗУ, то ядру ОС + * придется чаще обновлять страницы в unified page cache. + * + * Однако, O_DSYNC не предполагает отключение unified page cache, + * поэтому подобные затруднения будем считать проблемой ОС и/или + * ожидаемым пенальти из-за использования мелких страниц БД. + * + * 1) В режиме MDBX_SYNC_DURABLE - O_DSYNC для записи как данных, + * так и мета-страниц. 
Однако, на Linux отказ от O_DSYNC с последующим + * fdatasync() может быть выгоднее при использовании HDD, так как + * позволяет io-scheduler переупорядочить запись с учетом актуального + * расположения файла БД на носителе. + * + * 2) В режиме MDBX_NOMETASYNC - O_DSYNC можно использовать для данных, + * но в этом может не быть смысла, так как fdatasync() всё равно + * требуется для гарантии фиксации мета после предыдущей транзакции. + * + * В итоге на нормальных системах (не Windows) есть два варианта: + * - при возможности O_DIRECT и/или io_ring для данных, скорее всего, + * есть смысл вызвать fdatasync() перед записью данных, а затем + * использовать O_DSYNC; + * - не использовать O_DSYNC и вызывать fdatasync() после записи данных. + * + * На Windows же следует минимизировать использование FlushFileBuffers() + * из-за проблем с производительностью. Поэтому на Windows в режиме + * MDBX_NOMETASYNC: + * - мета обновляется через дескриптор без FILE_FLAG_WRITE_THROUGH; + * - перед началом записи данных вызывается FlushFileBuffers(), если + * mti_meta_sync_txnid не совпадает с последней записанной мета; + * - данные записываются через дескриптор с FILE_FLAG_WRITE_THROUGH. + * + * 3) В режиме MDBX_SAFE_NOSYNC - O_DSYNC нет смысла использовать, пока не + * будет реализована возможность полностью асинхронной "догоняющей" + * записи в выделенном процессе-сервере с io-ring очередями внутри. + * + * ----- + * + * Использование O_DIRECT или FILE_FLAG_NO_BUFFERING: + * + * Назначение этих флагов в отключении файлового дескриптора от + * unified page cache, т.е. от отображенных в память данных в случае + * libmdbx. + * + * Поэтому, использование direct i/o в libmdbx без MDBX_WRITEMAP лишено + * смысла и контр-продуктивно, ибо так мы провоцируем ядро ОС на + * не-когерентность отображения в память с содержимым файла на носителе, + * либо требуем дополнительных проверок и действий направленных на + * фактическое отключение O_DIRECT для отображенных в память данных. 
+ * + * В режиме MDBX_WRITEMAP когерентность отображенных данных обеспечивается + * физически. Поэтому использование direct i/o может иметь смысл, если у + * ядра ОС есть какие-то проблемы с msync(), в том числе с + * производительностью: + * - использование io_ring или gather-write может быть дешевле, чем + * просмотр PTE ядром и запись измененных/грязных; + * - но проблема в том, что записываемые из user mode страницы либо не + * будут помечены чистыми (и соответственно будут записаны ядром + * еще раз), либо ядру необходимо искать и чистить PTE при получении + * запроса на запись. + * + * Поэтому O_DIRECT или FILE_FLAG_NO_BUFFERING используется: + * - только в режиме MDBX_SYNC_DURABLE с MDBX_WRITEMAP; + * - когда me_psize >= me_os_psize; + * - опция сборки MDBX_AVOID_MSYNC != 0, которая по-умолчанию включена + * только на Windows (см ниже). + * + * ----- + * + * Использование FILE_FLAG_OVERLAPPED на Windows: + * + * У Windows очень плохо с I/O (за исключением прямых постраничных + * scatter/gather, которые работают в обход проблемного unified page + * cache и поэтому почти бесполезны в libmdbx). + * + * При этом всё еще хуже при использовании FlushFileBuffers(), что также + * требуется после FlushViewOfFile() в режиме MDBX_WRITEMAP. Поэтому + * на Windows вместо FlushViewOfFile() и FlushFileBuffers() следует + * использовать запись через дескриптор с FILE_FLAG_WRITE_THROUGH. + * + * В свою очередь, запись с FILE_FLAG_WRITE_THROUGH дешевле/быстрее + * при использовании FILE_FLAG_OVERLAPPED. В результате, на Windows + * в durable-режимах запись данных всегда в overlapped-режиме, + * при этом для записи мета требуется отдельный не-overlapped дескриптор. + */ + rc = osal_openfile((flags & MDBX_RDONLY) ? 
MDBX_OPEN_DXB_READ : MDBX_OPEN_DXB_LAZY, env, env_pathname.dxb, &env->me_lazy_fd, mode); @@ -13401,6 +13488,26 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) { ior_flags = IOR_OVERLAPPED; if ((flags & MDBX_WRITEMAP) && MDBX_AVOID_MSYNC) { + /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции + * MDBX_AVOID_MSYNC. + * + * 1) В этой комбинации наиболее выгодно использовать WriteFileGather(), + * но для этого необходимо открыть файл с флагом FILE_FLAG_NO_BUFFERING и + * после обеспечивать выравнивание адресов и размера данных на границу + * системной страницы, что в свою очередь возможно если размер страницы БД + * не меньше размера системной страницы ОЗУ. Поэтому для открытия файла в + * нужном режиме требуется знать размер страницы БД. + * + * 2) Кроме этого, в Windows запись в заблокированный регион файла + * возможна только через тот же дескриптор. Поэтому изначальный захват + * блокировок посредством osal_lck_seize(), захват/освобождение блокировок + * во время пишущих транзакций и запись данных должны выполняться через один + * дескриптор. + * + * Таким образом, требуется прочитать волатильный заголовок БД, чтобы + * узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном + * для записи данных, чтобы использовать именно этот дескриптор для + * изначального захвата блокировок. 
*/ MDBX_meta header; if (read_header(env, &header, MDBX_SUCCESS, true) == MDBX_SUCCESS && header.mm_psize >= env->me_os_psize) From 3579496945c69a7a0f2b7b8705f804d26e16da3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 6 Oct 2022 20:50:49 +0300 Subject: [PATCH 131/364] =?UTF-8?q?mdbx:=20=D0=BE=D1=82=D0=BA=D0=BB=D1=8E?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=B8=D0=B5=20`MDBX=5FHAVE=5FBUILTIN=5FCPU?= =?UTF-8?q?=5FSUPPORTS`=20=D0=B4=D0=BB=D1=8F=20e2k.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/options.h b/src/options.h index 1a28e619..c81529fd 100644 --- a/src/options.h +++ b/src/options.h @@ -200,6 +200,8 @@ /* Never use any modern features on Apple's or Google's OSes * since a lot of troubles with compatibility and/or performance */ #define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0 +#elif defined(__e2k__) +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0 #elif __has_builtin(__builtin_cpu_supports) || \ defined(__BUILTIN_CPU_SUPPORTS__) || \ (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23)) From bee7431f76dc9aa21415fb9e88aa241920f841ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 6 Oct 2022 23:47:16 +0300 Subject: [PATCH 132/364] =?UTF-8?q?mdbx++:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B0=20=D1=84=D0=B8=D0=BA=D1=81=D0=B0?= =?UTF-8?q?=D1=86=D0=B8=D1=8F=20=D1=82=D1=80=D0=B0=D0=BD=D0=B7=D0=B0=D0=BA?= =?UTF-8?q?=D1=86=D0=B8=D0=B8=20=D1=81=20=D0=BF=D0=BE=D0=BB=D1=83=D1=87?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20=D0=B8=D0=BD=D1=84=D0=BE=D1=80?= =?UTF-8?q?=D0=BC=D0=B0=D1=86=D0=B8=D0=B8=20=D0=BE=20=D0=B7=D0=B0=D0=B4?= =?UTF-8?q?=D0=B5=D1=80=D0=B6=D0=BA=D0=B0=D1=85.?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h++ | 23 +++++++++++++++++++++-- src/mdbx.c++ | 9 +++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/mdbx.h++ b/mdbx.h++ index 623b4cc2..473def91 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -3875,12 +3875,31 @@ public: //---------------------------------------------------------------------------- - /// \brief Abandon all the operations of the transaction instead of saving - /// them. + /// \brief Abandon all the operations of the transaction + /// instead of saving ones. void abort(); /// \brief Commit all the operations of a transaction into the database. void commit(); + + using commit_latency = MDBX_commit_latency; + + /// \brief Commit all the operations of a transaction into the database + /// and collect latency information. + void commit(commit_latency *); + + /// \brief Commit all the operations of a transaction into the database + /// and collect latency information. + void commit(commit_latency &latency) { return commit(&latency); } + + /// \brief Commit all the operations of a transaction into the database + /// and return latency information. + /// \returns latency information of commit stages. + commit_latency commit_get_latency() { + commit_latency result; + commit(&result); + return result; + } }; /// \brief Unmanaged cursor. 
diff --git a/src/mdbx.c++ b/src/mdbx.c++ index 4be94939..78a4ead0 100644 --- a/src/mdbx.c++ +++ b/src/mdbx.c++ @@ -1424,6 +1424,15 @@ void txn_managed::commit() { MDBX_CXX20_UNLIKELY err.throw_exception(); } +void txn_managed::commit(commit_latency *latency) { + const error err = + static_cast(::mdbx_txn_commit_ex(handle_, latency)); + if (MDBX_LIKELY(err.code() != MDBX_THREAD_MISMATCH)) + MDBX_CXX20_LIKELY handle_ = nullptr; + if (MDBX_UNLIKELY(err.code() != MDBX_SUCCESS)) + MDBX_CXX20_UNLIKELY err.throw_exception(); +} + //------------------------------------------------------------------------------ bool txn::drop_map(const char *name, bool throw_if_absent) { From 0f7e5073dbce4b92b67c985de77e5c4d21d2cf84 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 7 Oct 2022 00:42:01 +0300 Subject: [PATCH 133/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=BE=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=20=D1=81=D0=B1=D0=BE=D1=80=20=D0=B8?= =?UTF-8?q?=D0=BD=D1=84=D0=BE=D1=80=D0=BC=D0=B0=D1=86=D0=B8=D0=B8=20=D0=BE?= =?UTF-8?q?=20=D0=B7=D0=B0=D0=B4=D0=B5=D1=80=D0=B6=D0=BA=D0=B0=D1=85,=20?= =?UTF-8?q?=D1=87=D1=82=D0=BE=D0=B1=D1=8B=20=D0=B2=D0=BA=D0=BB=D1=8E=D1=87?= =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D1=8B=D0=B9=20=D0=B0=D1=83=D0=B4=D0=B8=D1=82?= =?UTF-8?q?=20=D0=BD=D0=B5=20=D0=B8=D1=81=D0=BA=D0=B0=D0=B6=D0=B0=D0=BB=20?= =?UTF-8?q?=D0=B7=D0=B0=D1=82=D1=80=D0=B0=D1=82=D1=8B=20=D0=BD=D0=B0=20GC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/core.c b/src/core.c index 562ce62e..f05cf356 100644 --- a/src/core.c +++ b/src/core.c @@ -10388,8 +10388,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); const uint64_t ts_0 = latency ? 
osal_monotime() : 0; - uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; - uint32_t audit_duration = 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0; int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) @@ -10534,7 +10533,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ts_1 = latency ? osal_monotime() : 0; txn_merge(parent, txn, parent_retired_len); - ts_2 = latency ? osal_monotime() : 0; + ts_2 = ts_3 = latency ? osal_monotime() : 0; env->me_txn = parent; parent->mt_child = NULL; tASSERT(parent, dirtylist_check(parent)); @@ -10553,7 +10552,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } #endif /* MDBX_ENABLE_REFUND */ - ts_4 = ts_3 = latency ? osal_monotime() : 0; + ts_4 = ts_5 = latency ? osal_monotime() : 0; txn->mt_signature = 0; osal_free(txn); tASSERT(parent, audit_ex(parent, 0, false) == 0); @@ -10632,12 +10631,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; - ts_2 = latency ? osal_monotime() : 0; + ts_2 = ts_3 = latency ? osal_monotime() : 0; if (AUDIT_ENABLED()) { rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); - const uint64_t audit_end = osal_monotime(); - audit_duration = osal_monotime_to_16dot16(audit_end - ts_2); - ts_2 = audit_end; + ts_3 = osal_monotime(); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -10668,7 +10665,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ - ts_3 = latency ? osal_monotime() : 0; + ts_4 = latency ? osal_monotime() : 0; MDBX_meta meta; memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); @@ -10696,7 +10693,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta, &txn->tw.troika); - ts_4 = latency ? 
osal_monotime() : 0; + ts_5 = latency ? osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; ERROR("txn-%s: error %d", "sync", rc); @@ -10710,14 +10707,15 @@ done: provide_latency: if (latency) { - latency->audit = audit_duration; latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; latency->gc = (ts_1 && ts_2) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->write = (ts_2 && ts_3) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->sync = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - const uint64_t ts_5 = osal_monotime(); - latency->ending = ts_4 ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - latency->whole = osal_monotime_to_16dot16(ts_5 - ts_0); + latency->audit = + (ts_2 && AUDIT_ENABLED()) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->write = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + latency->sync = (ts_4 && ts_5) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + const uint64_t ts_6 = osal_monotime(); + latency->ending = ts_5 ? 
osal_monotime_to_16dot16(ts_6 - ts_5) : 0; + latency->whole = osal_monotime_to_16dot16(ts_6 - ts_0); } return rc; From 92dabe1ad198dc0c2c495612f1792ab1b0c86c87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 7 Oct 2022 12:19:08 +0300 Subject: [PATCH 134/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BB=D0=B8=D1=88=D0=BD?= =?UTF-8?q?=D0=B5=D0=B3=D0=BE=20=D1=81=D0=B1=D1=80=D0=BE=D1=81=D0=B0=20?= =?UTF-8?q?=D0=B4=D0=B0=D0=BD=D0=BD=D1=8B=D1=85=20=D0=BD=D0=B0=20=D0=B4?= =?UTF-8?q?=D0=B8=D1=81=D0=BA=20=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC?= =?UTF-8?q?=D0=B5=20`MDBX=5FSAFE=5FNOSYNC`=20=D0=BF=D1=80=D0=B8=20=D0=BE?= =?UTF-8?q?=D0=B1=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B8=20GC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/core.c b/src/core.c index f05cf356..09ecd9b5 100644 --- a/src/core.c +++ b/src/core.c @@ -6351,8 +6351,8 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, #define MDBX_ALLOC_NEW 2 #define MDBX_ALLOC_COALESCE 4 #define MDBX_ALLOC_SLOT 8 -#define MDBX_ALLOC_FAKE 16 -#define MDBX_ALLOC_NOLOG 32 +#define MDBX_ALLOC_RESERVE 16 +#define MDBX_ALLOC_BACKLOG 32 #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { @@ -6657,7 +6657,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { DEBUG("gc-wipe-steady, rc %d", ret.err); eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->tw.troika).ptr_c); - } else if ((flags & MDBX_ALLOC_NEW) == 0 || + } else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || (autosync_threshold && atomic_load32(&env->me_lck->mti_unsynced_pages, 
mo_Relaxed) >= autosync_threshold) || @@ -6736,12 +6736,12 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { txn->mt_next_pgno - MDBX_ENABLE_REFUND)); int level; const char *what; - if (likely(!(flags & MDBX_ALLOC_FAKE))) { + if (likely(!(flags & MDBX_ALLOC_RESERVE))) { txn->mt_flags |= MDBX_TXN_ERROR; level = MDBX_LOG_ERROR; what = "pages"; } else { - level = (flags & MDBX_ALLOC_NOLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; + level = (flags & MDBX_ALLOC_BACKLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; } if (LOG_ENABLED(level)) @@ -6761,9 +6761,8 @@ done: if (likely(timestamp)) env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (unlikely(flags & MDBX_ALLOC_FAKE)) { - DEBUG("return NULL-page for %u pages %s allocation", num, - "gc-slot/backlog"); + if (unlikely(flags & MDBX_ALLOC_RESERVE)) { + DEBUG("return NULL for %u pages %s reservation", num, "gc-slot/backlog"); ret.page = NULL; ret.err = MDBX_SUCCESS; return ret; @@ -9135,7 +9134,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, if (unlikely(err != MDBX_SUCCESS)) return err; err = page_alloc_slowpath(&ctx->cursor.outer, (pgno_t)pages4retiredlist, - MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) + MDBX_ALLOC_GC | MDBX_ALLOC_RESERVE) .err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); cASSERT(&ctx->cursor.outer, @@ -9146,7 +9145,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, err == MDBX_SUCCESS) err = page_alloc_slowpath(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_FAKE | MDBX_ALLOC_NOLOG) + MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG) .err; ctx->cursor.outer.mc_flags |= C_RECLAIMING; @@ -9329,7 +9328,7 @@ retry: txn->tw.loose_count); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_FAKE) + MDBX_ALLOC_RESERVE) .err; if (rc == 
MDBX_SUCCESS) { TRACE("%s: retry since gc-slot for %zu loose-pages available", @@ -9554,7 +9553,7 @@ retry: snap_oldest = txn_oldest_reader(txn); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_FAKE) + MDBX_ALLOC_RESERVE) .err; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, From f6eec7195b3aa5317cfa98ac942679bb2b5debe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 7 Oct 2022 14:53:35 +0300 Subject: [PATCH 135/364] =?UTF-8?q?mdbx:=20=D0=B7=D0=B0=D1=89=D0=B8=D1=82?= =?UTF-8?q?=D0=B0=20=D0=BE=D1=82=20=D0=BD=D1=83=D0=BB=D1=8F=20=D1=82=D0=BE?= =?UTF-8?q?=D0=BB=D1=8C=D0=BA=D0=BE=20=D0=BE=D0=B1=D1=89=D0=B5=D0=B9=20?= =?UTF-8?q?=D0=B7=D0=B0=D0=B4=D0=B5=D1=80=D0=B6=D0=BA=D0=B8=20=D0=B2=20?= =?UTF-8?q?=D0=BC=D0=B5=D1=82=D1=80=D0=B8=D0=BA=D0=B0=D1=85=20=D1=82=D1=80?= =?UTF-8?q?=D0=B0=D0=BD=D0=B7=D0=B0=D0=BA=D1=86=D0=B8=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ранее, при конвертации очень коротких интервалов в формат фиксированной точки 16-точка-16, всегда выполнялось замещение нуля единицей. Т.е. если интервал был не нулевым, но меньше 15.259 микросекунд (1/65536 секунды), то вместо 0 возвращалось 1. Это приводило к тому, что сумма длительности отдельных стадий нередко была больше чем общее время фиксации транзакции. Проблема усугублялась, если получаемые значения аккумулировались по серии транзакций. Теперь такая защита от нуля выполняется только для общего времени, но не для отдельных стадий. Было: latency(ms): preparation=72.69 gc=72.69 write=73.04 sync=141.40 ending=72.69 whole=142.14 Аккумулированная сумма длительности этапов ВТРОЕ(!) больше общей длительности. 
Стало: latency(ms): preparation=0.00 gc=0.02 write=0.79 sync=67.98 ending=0.00 whole=140.81 Аккумулированная сумма длительности этапов меньше общей длительности, так как для каждой транзакции общая длительность возвращается не менее 15.259 микросекунд. --- src/core.c | 11 ++++++----- src/osal.c | 4 +--- src/osal.h | 5 +++++ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/core.c b/src/core.c index 09ecd9b5..a8371188 100644 --- a/src/core.c +++ b/src/core.c @@ -10714,7 +10714,7 @@ provide_latency: latency->sync = (ts_4 && ts_5) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; const uint64_t ts_6 = osal_monotime(); latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; - latency->whole = osal_monotime_to_16dot16(ts_6 - ts_0); + latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); } return rc; @@ -20775,14 +20775,15 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, const uint64_t monotime_now = osal_monotime(); uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); arg->mi_since_sync_seconds16dot16 = - ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); arg->mi_since_reader_check_seconds16dot16 = - ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; arg->mi_autosync_threshold = pgno2bytes( env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); - arg->mi_autosync_period_seconds16dot16 = osal_monotime_to_16dot16( - atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); + arg->mi_autosync_period_seconds16dot16 = + osal_monotime_to_16dot16_noUnderflow( + atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); arg->mi_bootid.current.x = bootid.x; arg->mi_bootid.current.y = bootid.y; arg->mi_mode = env->me_lck_mmap.lck ? 
lck->mti_envmode.weak : env->me_flags; diff --git a/src/osal.c b/src/osal.c index 6a2697fa..79760a12 100644 --- a/src/osal.c +++ b/src/osal.c @@ -2655,9 +2655,7 @@ MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { #else (uint32_t)(monotime * 128 / 1953125); #endif - if (likely(ret > 0)) - return ret; - return monotime > 0 /* fix underflow */; + return ret; } MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { diff --git a/src/osal.h b/src/osal.h index 7d4b37b3..20842eff 100644 --- a/src/osal.h +++ b/src/osal.h @@ -585,6 +585,11 @@ MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); +static inline uint32_t osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) { + uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime); + return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0); +} + MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ From 940ef30659322008d8762c9f303e26d038f11c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 8 Oct 2022 00:36:38 +0300 Subject: [PATCH 136/364] =?UTF-8?q?mdbx:=20=D1=81=D0=BF=D0=B8=D0=BB=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=B3=20=D0=B3=D1=80=D1=8F=D0=B7=D0=BD=D1=8B=D1=85?= =?UTF-8?q?=20=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86=20=D1=81=20=D1=83?= =?UTF-8?q?=D1=87=D0=B5=D1=82=D0=BE=D0=BC=20=D0=B8=D1=85=20=D1=81=D1=83?= =?UTF-8?q?=D0=BC=D0=BC=D0=B0=D1=80=D0=BD=D0=BE=D0=B3=D0=BE=20=D1=80=D0=B0?= =?UTF-8?q?=D0=B7=D0=BC=D0=B5=D1=80=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TODO.md | 2 +- src/core.c | 175 +++++++++++++++++++++++++++++++++++------------------ 2 files changed, 116 
insertions(+), 61 deletions(-) diff --git a/TODO.md b/TODO.md index 984b97ea..035f9e45 100644 --- a/TODO.md +++ b/TODO.md @@ -16,7 +16,6 @@ So currently most of the links are broken due to noted malicious ~~Github~~ sabo - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200). - [Migration guide from LMDB to MDBX](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/199). - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/193). - - [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/192). - [Support for RAW devices](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/124). - [Support MessagePack for Keys & Values](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/115). - [Engage new terminology](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/137). @@ -27,3 +26,4 @@ Done - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223). - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). + - [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/192). diff --git a/src/core.c b/src/core.c index a8371188..a0a2d7a2 100644 --- a/src/core.c +++ b/src/core.c @@ -4635,24 +4635,74 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i, * parent txn. That would alter the parent txns' data even though * the child hasn't committed yet, and we'd have no way to undo it if * the child aborted. 
*/ -static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, - const size_t need) { -#if xMDBX_DEBUG_SPILLING != 1 - /* production mode */ - if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) - return MDBX_SUCCESS; - size_t wanna_spill = need - txn->tw.dirtyroom; -#else - /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */ - size_t wanna_spill = - (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1; -#endif /* xMDBX_DEBUG_SPILLING */ +__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, + const intptr_t wanna_spill_entries, + const intptr_t wanna_spill_npages, + const size_t need); +static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, + const size_t need) { + intptr_t wanna_spill_entries = need - txn->tw.dirtyroom - txn->tw.loose_count; + intptr_t wanna_spill_npages = + need + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count - + txn->mt_env->me_options.dp_limit; + + /* production mode */ + if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) +#if xMDBX_DEBUG_SPILLING == 1 + /* debug mode: always try to spill if xMDBX_DEBUG_SPILLING == 1 */ + && txn->mt_txnid % 23 > 11 +#endif + ) + return MDBX_SUCCESS; + + return txn_spill_slowpath(txn, m0, wanna_spill_entries, wanna_spill_npages, + need); +} + +static size_t spill_gate(const MDBX_env *env, intptr_t part, + const size_t total) { + const intptr_t spill_min = + env->me_options.spill_min_denominator + ? (total + env->me_options.spill_min_denominator - 1) / + env->me_options.spill_min_denominator + : 1; + const intptr_t spill_max = + total - (env->me_options.spill_max_denominator + ? total / env->me_options.spill_max_denominator + : 0); + part = (part < spill_max) ? part : spill_max; + part = (part > spill_min) ? 
part : spill_min; + eASSERT(env, part > 0 && (size_t)part <= total); + return (size_t)part; +} + +__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, + const intptr_t wanna_spill_entries, + const intptr_t wanna_spill_npages, + const size_t need) { int rc = MDBX_SUCCESS; + if (unlikely(txn->tw.dirtylist->length <= txn->tw.loose_count)) + goto done; + + const size_t dirty_entries = txn->tw.dirtylist->length - txn->tw.loose_count; + const size_t dirty_npages = + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; + const size_t need_spill_entries = + spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries); + const size_t need_spill_npages = + spill_gate(txn->mt_env, wanna_spill_npages, dirty_npages); + + const size_t need_spill = (need_spill_entries > need_spill_npages) + ? need_spill_entries + : need_spill_npages; + if (!need_spill) + goto done; + #if !MDBX_AVOID_MSYNC if (txn->mt_flags & MDBX_WRITEMAP) { - NOTICE("%s-spilling of %zu dirty-entries (have %zu dirty-room, need %zu)", - "msync", wanna_spill, txn->tw.dirtyroom, need); + NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", + dirty_entries, dirty_npages); tASSERT(txn, txn->tw.spill_pages == nullptr); const MDBX_env *env = txn->mt_env; rc = @@ -4671,27 +4721,15 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } #endif /* MDBX_AVOID_MSYNC */ - const size_t dirty = txn->tw.dirtylist->length; - const size_t spill_min = - txn->mt_env->me_options.spill_min_denominator - ? dirty / txn->mt_env->me_options.spill_min_denominator - : 0; - const size_t spill_max = - dirty - (txn->mt_env->me_options.spill_max_denominator - ? dirty / txn->mt_env->me_options.spill_max_denominator - : 0); - wanna_spill = (wanna_spill > spill_min) ? wanna_spill : spill_min; - wanna_spill = (wanna_spill < spill_max) ? 
wanna_spill : spill_max; - if (!wanna_spill) - return MDBX_SUCCESS; - - NOTICE("%s-spilling %zu dirty-entries (have %zu dirty-room, need %zu)", - "pwrite", wanna_spill, txn->tw.dirtyroom, need); - tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); + NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", + need_spill_entries, need_spill_npages); + tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); + tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= + need_spill_npages); if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { if (!txn->tw.spill_pages) { txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = pnl_alloc(wanna_spill); + txn->tw.spill_pages = pnl_alloc(need_spill); if (unlikely(!txn->tw.spill_pages)) { rc = MDBX_ENOMEM; bailout: @@ -4701,7 +4739,7 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } else { /* purge deleted slots */ spill_purge(txn); - rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill); + rc = pnl_reserve(&txn->tw.spill_pages, need_spill); (void)rc /* ignore since the resulting list may be shorter and pnl_append() will increase pnl on demand */ ; @@ -4758,48 +4796,63 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); /* half of 8-bit radix-sort */ - pgno_t radix_counters[256], spillable = 0; - memset(&radix_counters, 0, sizeof(radix_counters)); + pgno_t radix_entries[256], radix_npages[256]; + memset(&radix_entries, 0, sizeof(radix_entries)); + memset(&radix_npages, 0, sizeof(radix_npages)); + size_t spillable_entries = 0, spillable_npages = 0; const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); for (size_t i = 1; i <= dl->length; ++i) { - unsigned prio = spill_prio(txn, i, reciprocal); + const unsigned prio = spill_prio(txn, i, reciprocal); if (prio < 256) { - radix_counters[prio] += 1; - spillable += 1; + radix_entries[prio] += 1; + spillable_entries 
+= 1; + const pgno_t npages = dpl_npages(dl, i); + radix_npages[prio] += npages; + spillable_npages += npages; } } - if (likely(spillable > 0)) { - size_t prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0]; + tASSERT(txn, spillable_npages >= spillable_entries); + pgno_t spilled_entries = 0, spilled_npages = 0; + if (likely(spillable_entries > 0)) { + size_t prio2spill = 0, prio2adjacent = 128, + amount_entries = radix_entries[0], amount_npages = radix_npages[0]; for (size_t i = 1; i < 256; i++) { - if (amount < wanna_spill) { + if (amount_entries < need_spill_entries || + amount_npages < need_spill_npages) { prio2spill = i; prio2adjacent = i + (257 - i) / 2; - amount += radix_counters[i]; - } else if (amount + amount < spillable + wanna_spill - /* РАВНОЗНАЧНО: amount - wanna_spill < spillable - amount */) { + amount_entries += radix_entries[i]; + amount_npages += radix_npages[i]; + } else if (amount_entries + amount_entries < + spillable_entries + need_spill_entries + /* РАВНОЗНАЧНО: amount - need_spill < spillable - amount */ + || amount_npages + amount_npages < + spillable_npages + need_spill_npages) { prio2adjacent = i; - amount += radix_counters[i]; + amount_entries += radix_entries[i]; + amount_npages += radix_npages[i]; } else break; } - VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %u," - " wanna-spill %zu, amount %zu", - prio2spill, prio2adjacent, spillable, wanna_spill, amount); + VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu," + " wanna-spill %zu/%zu, amount %zu/%zu", + prio2spill, prio2adjacent, spillable_entries, spillable_npages, + need_spill_entries, need_spill_npages, amount_entries, + amount_npages); tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); iov_ctx_t ctx; - rc = iov_init(txn, &ctx, amount, - txn->tw.dirtylist->pages_including_loose - - txn->tw.loose_count); + rc = iov_init(txn, &ctx, amount_entries, amount_npages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; unsigned prev_prio = 256, 
prio; size_t r, w; - pgno_t spilled_entries = 0, spilled_npages = 0; - for (w = 0, r = 1; r <= dl->length && spilled_entries < wanna_spill; + for (w = 0, r = 1; + r <= dl->length && (spilled_entries < need_spill_entries || + spilled_npages < need_spill_npages); prev_prio = prio, ++r) { prio = spill_prio(txn, r, reciprocal); MDBX_page *const dp = dl->items[r].ptr; @@ -4850,7 +4903,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, spilled_npages); - tASSERT(txn, spillable == 0 || spilled_entries > 0); + tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); + tASSERT(txn, spilled_npages >= spilled_entries); while (r <= dl->length) dl->items[++w] = dl->items[r++]; @@ -4872,13 +4926,13 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; } - NOTICE("spilled %u dirty-entries, now have %zu dirty-room", spilled_entries, - txn->tw.dirtyroom); + NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", + spilled_entries, spilled_npages, txn->tw.dirtyroom); } else { tASSERT(txn, rc == MDBX_SUCCESS); for (size_t i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; - NOTICE( + VERBOSE( "dirtylist[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), spill_prio(txn, i, reciprocal)); @@ -4888,13 +4942,14 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, #if xMDBX_DEBUG_SPILLING == 2 if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1) ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; " - "needed %zu, spillable %u; " + "needed %zu, spillable %zu; " "spilled %u dirty-entries, now have %zu dirty-room", - dl->length + spilled, dl->length, + dl->length + spilled_entries, dl->length, (txn->mt_parent && txn->mt_parent->tw.dirtylist) ? 
(intptr_t)txn->mt_parent->tw.dirtylist->length : -1, - txn->tw.loose_count, need, spillable, spilled, txn->tw.dirtyroom); + txn->tw.loose_count, need, spillable_entries, spilled_entries, + txn->tw.dirtyroom); ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); #endif /* xMDBX_DEBUG_SPILLING */ From db72763de049d6e4546f838277fe83b9081ad1de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 8 Oct 2022 15:02:45 +0300 Subject: [PATCH 137/364] =?UTF-8?q?mdbx:=20=D0=BE=D1=82=D0=BA=D0=BB=D1=8E?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=83=D1=87=D0=B5=D1=82=D0=B0?= =?UTF-8?q?=20=D0=B3=D1=80=D1=8F=D0=B7=D0=BD=D1=8B=D1=85=20=D1=81=D1=82?= =?UTF-8?q?=D1=80=D0=B0=D0=BD=D0=B8=D1=86=20=D0=B2=20=D0=BD=D0=B5=20=D1=82?= =?UTF-8?q?=D1=80=D0=B5=D0=B1=D1=83=D1=8E=D1=89=D0=B8=D1=85=20=D1=8D=D1=82?= =?UTF-8?q?=D0=BE=D0=B3=D0=BE=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=B0=D1=85?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit В режиме MDBX_WRITEMAP с опцией сборки MDBX_AVOID_MSYNC=0 отслеживание грязных страниц не требуется. Эта доработка устраняет еще одну из недоделок (пункт в TODO). --- TODO.md | 2 +- src/core.c | 304 ++++++++++++++++++++++++++++++++---------------- src/internals.h | 2 +- 3 files changed, 207 insertions(+), 101 deletions(-) diff --git a/TODO.md b/TODO.md index 035f9e45..66b0fff9 100644 --- a/TODO.md +++ b/TODO.md @@ -15,7 +15,6 @@ So currently most of the links are broken due to noted malicious ~~Github~~ sabo - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/210). - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200). 
- [Migration guide from LMDB to MDBX](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/199). - - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/193). - [Support for RAW devices](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/124). - [Support MessagePack for Keys & Values](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/115). - [Engage new terminology](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/137). @@ -27,3 +26,4 @@ Done - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223). - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). - [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/192). + - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/193). diff --git a/src/core.c b/src/core.c index a0a2d7a2..2e726664 100644 --- a/src/core.c +++ b/src/core.c @@ -2771,6 +2771,9 @@ static void dpl_free(MDBX_txn *txn) { } static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + size_t bytes = dpl_size2bytes((size < MDBX_PGL_LIMIT) ? 
size : MDBX_PGL_LIMIT); MDBX_dpl *const dl = osal_realloc(txn->tw.dirtylist, bytes); @@ -2787,6 +2790,8 @@ static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) { static int dpl_alloc(MDBX_txn *txn) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) ? txn->mt_env->me_options.dp_initial : txn->mt_geo.upper; @@ -2812,6 +2817,9 @@ RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY, SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); const size_t unsorted = dl->length - dl->sorted; @@ -2865,6 +2873,9 @@ __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { } static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT); assert(dl->sorted <= dl->length); @@ -2878,6 +2889,9 @@ static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) __hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); if (AUDIT_ENABLED()) { @@ -2933,6 +2947,9 @@ dpl_endpgno(const MDBX_dpl *dl, size_t i) { static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, pgno_t npages) { 
+ tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->sorted == dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); @@ -2956,7 +2973,8 @@ static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, return rc; } -static __always_inline size_t dpl_exist(MDBX_txn *txn, pgno_t pgno) { +static __always_inline size_t dpl_exist(const MDBX_txn *txn, pgno_t pgno) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *dl = txn->tw.dirtylist; size_t i = dpl_search(txn, pgno); assert((int)i > 0); @@ -2965,21 +2983,31 @@ static __always_inline size_t dpl_exist(MDBX_txn *txn, pgno_t pgno) { MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); const MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - for (size_t i = dl->length; i > dl->sorted; --i) - if (dl->items[i].pgno == pgno) - return dl->items[i].ptr; + if (dl) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + for (size_t i = dl->length; i > dl->sorted; --i) + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; - if (dl->sorted) { - const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; - if (dl->items[i].pgno == pgno) - return dl->items[i].ptr; + if (dl->sorted) { + const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; + } + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); } return nullptr; } static void dpl_remove_ex(const MDBX_txn *txn, size_t i, pgno_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) 
== 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert((intptr_t)i > 0 && i <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); @@ -2999,6 +3027,8 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, pgno_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); @@ -3047,6 +3077,8 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, } static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); const MDBX_dpl *dl = txn->tw.dirtylist; assert((intptr_t)i > 0 && i <= dl->length); /* overflow could be here */ @@ -3693,6 +3725,7 @@ static void dpage_free(MDBX_env *env, MDBX_page *dp, pgno_t npages) { /* Return all dirty pages to dpage list */ static void dlist_free(MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); MDBX_env *env = txn->mt_env; MDBX_dpl *const dl = txn->tw.dirtylist; @@ -3712,7 +3745,14 @@ static __always_inline MDBX_db *outer_db(MDBX_cursor *mc) { } MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); const MDBX_dpl *const dl = txn->tw.dirtylist; + if (!dl) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + return true; + } + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); tASSERT(txn, txn->tw.dirtyroom + dl->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom @@ -3822,12 +3862,17 @@ static void refund_loose(MDBX_txn *txn) { tASSERT(txn, txn->tw.loose_count > 0); MDBX_dpl *const dl = txn->tw.dirtylist; - tASSERT(txn, dl->length >= txn->tw.loose_count); + if (dl) { + tASSERT(txn, dl->length >= txn->tw.loose_count); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; MDBX_PNL suitable = onstack; - if (dl->length - dl->sorted > txn->tw.loose_count) { + if (!dl || dl->length - dl->sorted > txn->tw.loose_count) { /* Dirty list is useless since unsorted. */ if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) { suitable = pnl_alloc(txn->tw.loose_count); @@ -3873,35 +3918,36 @@ static void refund_loose(MDBX_txn *txn) { const size_t refunded = txn->mt_next_pgno - most; DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, most, txn->mt_next_pgno); - txn->tw.loose_count -= refunded; - txn->tw.dirtyroom += refunded; - dl->pages_including_loose -= refunded; - assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); txn->mt_next_pgno = most; + txn->tw.loose_count -= refunded; + if (dl) { + txn->tw.dirtyroom += refunded; + dl->pages_including_loose -= refunded; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); - /* Filter-out dirty list */ - size_t r = 0; - w = 0; - if (dl->sorted) { - do { + /* Filter-out dirty list */ + size_t r = 0; + w = 0; + if (dl->sorted) { + do { + if (dl->items[++r].pgno < most) { + if (++w != r) + dl->items[w] = dl->items[r]; + } + } while (r < dl->sorted); + dl->sorted = w; + } + while (r < dl->length) { if (dl->items[++r].pgno < most) { if (++w != r) dl->items[w] = dl->items[r]; } - } while (r < dl->sorted); - dl->sorted = w; - } - while (r < dl->length) { - if (dl->items[++r].pgno < most) { - if (++w != r) - dl->items[w] = dl->items[r]; } + dpl_setlen(dl, w); + 
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); } - dpl_setlen(dl, w); - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - goto unlink_loose; } } else { @@ -4025,6 +4071,8 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, /* Remove page from dirty list */ static __inline void page_wash(MDBX_txn *txn, const size_t di, MDBX_page *const mp, const pgno_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, di && di <= txn->tw.dirtylist->length && txn->tw.dirtylist->items[di].ptr == mp); dpl_remove_ex(txn, di, npages); @@ -4139,7 +4187,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, tASSERT(txn, !debug_dpl_find(txn, pgno)); } - di = is_dirty ? dpl_exist(txn, pgno) : 0; + di = (is_dirty && txn->tw.dirtylist) ? dpl_exist(txn, pgno) : 0; si = is_spilled ? search_spilled(txn, pgno) : 0; tASSERT(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); } else { @@ -4195,7 +4243,7 @@ status_done: * Её МОЖНО вытолкнуть в нераспределенный хвост. 
*/ kind = "spilled"; spill_remove(txn, si, npages); - } else if ((txn->mt_flags & MDBX_WRITEMAP)) { + } else if (txn->mt_flags & MDBX_WRITEMAP) { kind = "writemap"; tASSERT(txn, mp && IS_MODIFIABLE(txn, mp)); } else { @@ -4278,7 +4326,7 @@ status_done: if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif kill_page(txn, mp, pgno, npages); - if (!(txn->mt_flags & MDBX_WRITEMAP)) { + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)), @@ -4504,9 +4552,7 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, const pgno_t npages) { -#if !MDBX_AVOID_MSYNC - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); -#endif /* MDBX_AVOID_MSYNC */ + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP) || MDBX_AVOID_MSYNC); #if MDBX_ENABLE_PGOP_STAT txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -4521,6 +4567,8 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. */ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); size_t keep = 0; while (mc->mc_flags & C_INITIALIZED) { for (size_t i = 0; i < mc->mc_snum; ++i) { @@ -4542,6 +4590,8 @@ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { } static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); size_t keep = m0 ? 
cursor_keep(txn, m0) : 0; for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && @@ -4642,6 +4692,9 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, const size_t need) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + intptr_t wanna_spill_entries = need - txn->tw.dirtyroom - txn->tw.loose_count; intptr_t wanna_spill_npages = need + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count - @@ -4681,6 +4734,9 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intptr_t wanna_spill_entries, const intptr_t wanna_spill_npages, const size_t need) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + int rc = MDBX_SUCCESS; if (unlikely(txn->tw.dirtylist->length <= txn->tw.loose_count)) goto done; @@ -4963,6 +5019,13 @@ done: static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, const MDBX_val *data) { MDBX_txn *txn = mc->mc_txn; + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + return MDBX_SUCCESS; + } + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + /* Estimate how much space this operation will take: */ /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ size_t need = CURSOR_STACK + 3; @@ -5417,6 +5480,13 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, /* Add a page to the txn's dirty list */ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, pgno_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 
&& !MDBX_AVOID_MSYNC); + return MDBX_SUCCESS; + } + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + #if xMDBX_DEBUG_SPILLING == 2 txn->mt_env->debug_dirtied_act += 1; ENSURE(txn->mt_env, @@ -5439,12 +5509,16 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, txn->tw.loose_pages = loose->mp_next; txn->tw.loose_count--; txn->tw.dirtyroom++; - if (!(txn->mt_flags & MDBX_WRITEMAP)) + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); dpage_free(txn->mt_env, loose, 1); + } } else { ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); - if (!(txn->mt_flags & MDBX_WRITEMAP)) + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); dpage_free(txn->mt_env, mp, npages); + } return MDBX_TXN_FULL; } } @@ -8060,11 +8134,18 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Moved to here to avoid a data race in read TXNs */ txn->mt_geo = head.ptr_c->mm_geo; - rc = dpl_alloc(txn); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; - txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) { + rc = dpl_alloc(txn); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; + txn->tw.dirtylru = MDBX_DEBUG ? 
~42u : 0; + } else { + tASSERT(txn, txn->tw.dirtylist == nullptr); + txn->tw.dirtylist = nullptr; + txn->tw.dirtyroom = MAX_PAGENO; + txn->tw.dirtylru = 0; + } } /* Setup db info */ @@ -8694,6 +8775,8 @@ static void dbi_update(MDBX_txn *txn, int keep) { /* Filter-out pgno list from transaction's dirty-page list */ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) { tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); MDBX_dpl *dl = dpl_sort(txn); @@ -8726,8 +8809,10 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { remove_dl: npages = dpl_npages(dl, r); dl->pages_including_loose -= npages; - if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) + if (!MDBX_AVOID_MSYNC || !(txn->mt_env->me_flags & MDBX_WRITEMAP)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); dpage_free(txn->mt_env, dl->items[r].ptr, npages); + } ++r; next_i: i += step; @@ -8874,8 +8959,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { parent->tw.dirtylru = txn->tw.dirtylru; tASSERT(parent, dirtylist_check(parent)); tASSERT(parent, audit_ex(parent, 0, false) == 0); - if (!(env->me_flags & MDBX_WRITEMAP)) - dlist_free(txn); + dlist_free(txn); dpl_free(txn); pnl_free(txn->tw.reclaimed_pglist); @@ -9424,30 +9508,37 @@ retry: /* filter-out list of dirty-pages from loose-pages */ MDBX_dpl *const dl = txn->tw.dirtylist; - size_t w = 0; - for (size_t r = w; ++r <= dl->length;) { - MDBX_page *dp = dl->items[r].ptr; - tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); - tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); - if ((dp->mp_flags & P_LOOSE) == 0) { - if (++w != r) - dl->items[w] = dl->items[r]; - } else { - tASSERT(txn, dp->mp_flags == P_LOOSE); - if ((env->me_flags & MDBX_WRITEMAP) == 0) - 
dpage_free(env, dp, 1); + if (dl) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + size_t w = 0; + for (size_t r = w; ++r <= dl->length;) { + MDBX_page *dp = dl->items[r].ptr; + tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); + if ((dp->mp_flags & P_LOOSE) == 0) { + if (++w != r) + dl->items[w] = dl->items[r]; + } else { + tASSERT(txn, dp->mp_flags == P_LOOSE); + if (!MDBX_AVOID_MSYNC || !(env->me_flags & MDBX_WRITEMAP)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + dpage_free(env, dp, 1); + } + } } + TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", + dbg_prefix_mode, dl->length, w); + tASSERT(txn, txn->tw.loose_count == dl->length - w); + dpl_setlen(dl, w); + dl->sorted = 0; + dl->pages_including_loose -= txn->tw.loose_count; + txn->tw.dirtyroom += txn->tw.loose_count; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); } - TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", - dbg_prefix_mode, dl->length, w); - tASSERT(txn, txn->tw.loose_count == dl->length - w); - dpl_setlen(dl, w); - dl->sorted = 0; - dl->pages_including_loose -= txn->tw.loose_count; - txn->tw.dirtyroom += txn->tw.loose_count; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); txn->tw.loose_pages = NULL; txn->tw.loose_count = 0; #if MDBX_ENABLE_REFUND @@ -10032,9 +10123,8 @@ bailout: } static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { - MDBX_dpl *dl = txn->tw.dirtylist; - if (MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) - dl = dpl_sort(txn); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *const dl = dpl_sort(txn); int rc = MDBX_SUCCESS; size_t r, w; for (w = 0, r = 1; r <= dl->length; ++r) { @@ -10087,6 +10177,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } /* Merge child txn into parent */ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); MDBX_dpl *const src = dpl_sort(txn); /* Remove refunded pages from parent's dirty list */ @@ -10094,10 +10185,8 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, if (MDBX_ENABLE_REFUND) { size_t n = dst->length; while (n && dst->items[n].pgno >= parent->mt_next_pgno) { - if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { - unsigned npages = dpl_npages(dst, n); - dpage_free(txn->mt_env, dst->items[n].ptr, npages); - } + const unsigned npages = dpl_npages(dst, n); + dpage_free(txn->mt_env, dst->items[n].ptr, npages); --n; } parent->tw.dirtyroom += dst->sorted - n; @@ -10298,8 +10387,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, ++l; } else { dst->items[d--].ptr = nullptr; - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - dpage_free(txn->mt_env, dp, d_npages); + dpage_free(txn->mt_env, dp, d_npages); } } assert(dst->sorted == dst->length); @@ -10614,13 +10702,18 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto provide_latency; } - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + } cursors_eot(txn, false); end_mode |= MDBX_END_EOTDONE; - if (txn->tw.dirtylist->length == 0 && + if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { for (intptr_t i = txn->mt_numdbs; --i >= 0;) tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); @@ -10694,14 +10787,6 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } const meta_ptr_t head = meta_recent(env, &txn->tw.troika); - iov_ctx_t write_ctx; - rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, - txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "iov-init", rc); - goto fail; - } - if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (uint32_t)head.txnid) { /* sync prev meta */ @@ -10712,10 +10797,24 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } } - rc = txn_write(txn, &write_ctx); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "write", rc); - goto fail; + if (txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + iov_ctx_t write_ctx; + rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, + txn->tw.dirtylist->pages_including_loose - + txn->tw.loose_count); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "iov-init", rc); + goto fail; + } + + rc = txn_write(txn, &write_ctx); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "write", rc); + goto fail; + } + } else { + 
tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); } /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ @@ -18743,10 +18842,17 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } __cold static int cursor_check(MDBX_cursor *mc) { - cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == - (mc->mc_txn->mt_parent - ? mc->mc_txn->mt_parent->tw.dirtyroom - : mc->mc_txn->mt_env->me_options.dp_limit)); + if (!mc->mc_txn->tw.dirtylist) { + cASSERT(mc, + (mc->mc_txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } else { + cASSERT(mc, + (mc->mc_txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == + (mc->mc_txn->mt_parent + ? mc->mc_txn->mt_parent->tw.dirtyroom + : mc->mc_txn->mt_env->me_options.dp_limit)); + } cASSERT(mc, mc->mc_top == mc->mc_snum - 1 || (mc->mc_checking & CC_UPDATING)); if (unlikely(mc->mc_top != mc->mc_snum - 1) && (mc->mc_checking & CC_UPDATING) == 0) diff --git a/src/internals.h b/src/internals.h index 8af9dde2..8298989d 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1263,7 +1263,7 @@ struct MDBX_env { #define xMDBX_DEBUG_SPILLING 0 #endif #if xMDBX_DEBUG_SPILLING == 2 - unsigned debug_dirtied_est, debug_dirtied_act; + size_t debug_dirtied_est, debug_dirtied_act; #endif /* xMDBX_DEBUG_SPILLING */ /* ------------------------------------------------- stub for lck-less mode */ From ad091646046c29ca66e35cc45156b490ee76d8cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 8 Oct 2022 17:29:29 +0300 Subject: [PATCH 138/364] =?UTF-8?q?mdbx:=20=D0=BC=D0=B8=D0=BD=D0=BE=D1=80?= =?UTF-8?q?=D0=BD=D0=BE=D0=B5=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B4=D0=BB=D1=8F=20=D1=83=D1=81?= 
=?UTF-8?q?=D1=82=D1=80=D0=B0=D0=BD=D0=B5=D0=BD=D0=B8=D1=8F=20=D1=81=D1=80?= =?UTF-8?q?=D0=B0=D0=B1=D0=B0=D1=82=D1=8B=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=BF=D1=80=D0=BE=D0=B2=D0=B5=D1=80=D0=BE=D1=87=D0=BD=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D1=83=D1=82=D0=B2=D0=B5=D1=80=D0=B6=D0=B4=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D1=8F=20=D0=B2=20=D0=BE=D1=82=D0=BB=D0=B0=D0=B4?= =?UTF-8?q?=D0=BE=D1=87=D0=BD=D1=8B=D1=85=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA?= =?UTF-8?q?=D0=B0=D1=85.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ассерт мог срабатывать из-за отсутствия бита P_LEAF2 в передаваемом проверочном значении. На что-либо другое не влияло, но следует понять, почему этот недочет не был выявлен тестами раньше. --- src/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/core.c b/src/core.c index 2e726664..12e7bd8a 100644 --- a/src/core.c +++ b/src/core.c @@ -4127,8 +4127,8 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, check = page_get_any(mc, pgno, txn->mt_front); if (unlikely(check.err != MDBX_SUCCESS)) return check.err; - tASSERT(txn, (check.page->mp_flags & ~(P_LEAF2 | P_SPILLED)) == - (pageflags & ~P_FROZEN)); + tASSERT(txn, + (check.page->mp_flags & ~P_SPILLED) == (pageflags & ~P_FROZEN)); tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } if (pageflags & P_FROZEN) { @@ -21483,9 +21483,10 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { } else { cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth); mc->mc_checking |= CC_RETIRING; - const unsigned pagetype = - (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + - ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? P_LEAF : P_BRANCH); + const unsigned pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + + ((mc->mc_snum + 1 == mc->mc_db->md_depth) + ? 
(mc->mc_checking & (P_LEAF | P_LEAF2)) + : P_BRANCH); for (size_t i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); tASSERT(txn, (node_flags(node) & From 14eda2cd17fd861f9a9d2299eef03b805c773e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 00:16:40 +0300 Subject: [PATCH 139/364] =?UTF-8?q?mdbx-windows:=20=D0=B8=D1=81=D0=BF?= =?UTF-8?q?=D0=BE=D0=BB=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20?= =?UTF-8?q?=20=5FCrtDbgReport()=20=D0=B2=20=D0=BE=D1=82=D0=BB=D0=B0=D0=B4?= =?UTF-8?q?=D0=BE=D1=87=D0=BD=D1=8B=D1=85=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA?= =?UTF-8?q?=D0=B0=D1=85.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/osal.c b/src/osal.c index 79760a12..e120490a 100644 --- a/src/osal.c +++ b/src/osal.c @@ -20,6 +20,10 @@ #include +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) +#include +#endif + static int waitstatus2errcode(DWORD result) { switch (result) { case WAIT_OBJECT_0: @@ -252,9 +256,14 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, while (1) { #if defined(_WIN32) || defined(_WIN64) +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) + _CrtDbgReport(_CRT_ASSERT, func ? func : "unknown", line, "libmdbx", + "assertion failed: %s", msg); +#else if (IsDebuggerPresent()) DebugBreak(); - FatalExit(ERROR_UNHANDLED_ERROR); +#endif + FatalExit(STATUS_ASSERTION_FAILURE); #else abort(); #endif @@ -278,10 +287,15 @@ __cold void mdbx_panic(const char *fmt, ...) 
{ while (1) { #if defined(_WIN32) || defined(_WIN64) +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) + _CrtDbgReport(_CRT_ASSERT, "mdbx.c", 0, "libmdbx", "panic: %s", + const_message); +#else OutputDebugStringA("\r\nMDBX-PANIC: "); OutputDebugStringA(const_message); if (IsDebuggerPresent()) DebugBreak(); +#endif FatalExit(ERROR_UNHANDLED_ERROR); #else __assert_fail(const_message, "mdbx", 0, "panic"); From ae8e37314310da16f08c0159fa3ba1efd7a600f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 12:47:20 +0300 Subject: [PATCH 140/364] =?UTF-8?q?mdbx-test:=20=D0=B1=D0=BE=D0=BB=D1=8C?= =?UTF-8?q?=D1=88=D0=B5=20winnt-=D1=81=D1=82=D0=B0=D1=82=D1=83=D1=81=D0=BE?= =?UTF-8?q?=D0=B2=20=D0=BA=D0=B0=D0=BA=20coredump.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/osal-windows.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 7b3b4437..70b8cf5c 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -348,6 +348,7 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { status = as_debugging; break; case STATUS_CONTROL_C_EXIT: + case /* STATUS_INTERRUPTED */ 0xC0000515L: status = as_killed; break; case EXCEPTION_ACCESS_VIOLATION: @@ -357,10 +358,16 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { case EXCEPTION_INVALID_DISPOSITION: case EXCEPTION_ILLEGAL_INSTRUCTION: case EXCEPTION_NONCONTINUABLE_EXCEPTION: + case /* STATUS_STACK_BUFFER_OVERRUN, STATUS_BUFFER_OVERFLOW_PREVENTED */ + 0xC0000409L: + case /* STATUS_ASSERTION_FAILURE */ 0xC0000420L: + case /* STATUS_HEAP_CORRUPTION */ 0xC0000374L: + case /* STATUS_CONTROL_STACK_VIOLATION */ 0xC00001B2L: + log_error("pid %u, exception 0x%x", pid, ExitCode); status = as_coredump; break; default: - log_error("pid %u, ExitCode", pid, ExitCode); + log_error("pid %u, exit code 
%u", pid, ExitCode); status = as_failed; break; } From 688ec3e85cfd859502ebe3da9c9f629c53e8a260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 15:41:00 +0300 Subject: [PATCH 141/364] =?UTF-8?q?mdbx-test:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B8=D1=81=D0=BA=D0=BB?= =?UTF-8?q?=D1=8E=D1=87=D0=B5=D0=BD=D0=B8=D0=B9=20Valgrind=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D0=BD=D0=BE=D0=B2=D0=BE=D0=B3=D0=BE=20=D0=BA=D0=BE?= =?UTF-8?q?=D0=B4=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.c | 6 +++++- test/valgrind_suppress.txt | 14 ++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/osal.c b/src/osal.c index e120490a..99bcd7cd 100644 --- a/src/osal.c +++ b/src/osal.c @@ -621,8 +621,12 @@ MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, #endif /* !Windows */ #if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) - if (!osal_iov_max) + if (!osal_iov_max) { osal_iov_max = sysconf(_SC_IOV_MAX); + if (RUNNING_ON_VALGRIND && osal_iov_max > 64) + /* чтобы не описывать все 1024 исключения в valgrind_suppress.txt */ + osal_iov_max = 64; + } #endif ior->boundary = (char *)(ior->pool + ior->allocated); diff --git a/test/valgrind_suppress.txt b/test/valgrind_suppress.txt index 2eb3142b..2a95ff0f 100644 --- a/test/valgrind_suppress.txt +++ b/test/valgrind_suppress.txt @@ -33,16 +33,22 @@ # memcmp() inside iov_write() as workaround for todo4recovery://erased_by_github/libmdbx/issues/269 { - write-page-check-bcmp + iov-pagecheck-1 Memcheck:Cond fun:bcmp - fun:iov_write* + fun:iov_callback4dirtypages + fun:osal_ioring_walk + fun:iov_complete + fun:iov_write } { - write-page-check-memcmp + iov-pagecheck-2 Memcheck:Cond fun:memcmp* - fun:iov_write* + fun:iov_callback4dirtypages + fun:osal_ioring_walk + fun:iov_complete + fun:iov_write 
} # single-page flush by pwrite() From 63b4d2289d79e58d569a8a77aa326122dfeb3a36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 18:30:08 +0300 Subject: [PATCH 142/364] =?UTF-8?q?mdbx:=20=D1=83=D0=B4=D0=B0=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20utf8bom=20=D0=B4=D0=BB=D1=8F=20=D1=83?= =?UTF-8?q?=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=BF?= =?UTF-8?q?=D1=80=D0=BE=D0=B1=D0=BB=D0=B5=D0=BC=20=D0=B0=D0=BC=D0=B0=D0=BB?= =?UTF-8?q?=D1=8C=D0=B3=D0=B0=D0=BC=D0=B0=D1=86=D0=B8=D0=B8=20=D0=BA=D0=BE?= =?UTF-8?q?=D0=B4=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osal.c b/src/osal.c index 99bcd7cd..54d8a9dd 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1,4 +1,4 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* * Copyright 2015-2022 Leonid Yuriev From 92d203a12c3d1d16cf4d46439f29884bd745bbdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 19:41:13 +0300 Subject: [PATCH 143/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BB=D0=BE=D0=B6=D0=BD?= =?UTF-8?q?=D0=BE=D0=B3=D0=BE=20=D1=81=D1=80=D0=B0=D0=B1=D0=B0=D1=82=D1=8B?= =?UTF-8?q?=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20=D0=BA=D0=BE=D0=BD=D1=82=D1=80?= =?UTF-8?q?=D0=BE=D0=BB=D1=8F=20"invalid=20page-address"=20=D0=B2=20`page?= =?UTF-8?q?=5Fcheck()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit При проверке использовалось глобальное значение me_dxb_mmap.current, к которому не должны обращаться читающие транзакции. 
В результате, в сложных много-поточных сценариях с изменением размера БД и её переполнением, проверка могла выдавать ложно-положительный результат. С точки зрения пользователя, ошибка могла проявляться как возврат `MDBX_CORRUPTED` из читающей транзакции, когда включен "безопасный режим" (дополнительный контроль), а в параллельной пишущей транзакции происходит увеличение размера БД с последующим переполнением и откатом этой транзакции. При этом никакого повреждения структуры БД нет. --- src/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index 12e7bd8a..c4ae564e 100644 --- a/src/core.c +++ b/src/core.c @@ -18476,9 +18476,9 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { unsigned flags_mask = P_ILL_BITS; unsigned flags_expected = 0; if (offset < 0 || - offset > (ptrdiff_t)(env->me_dxb_mmap.current - ((mp->mp_flags & P_SUBP) - ? PAGEHDRSZ + 1 - : env->me_psize))) { + offset > (ptrdiff_t)(pgno2bytes(env, mc->mc_txn->mt_next_pgno) - + ((mp->mp_flags & P_SUBP) ? PAGEHDRSZ + 1 + : env->me_psize))) { /* should be dirty page without MDBX_WRITEMAP, or a subpage of. 
*/ flags_mask -= P_SUBP; if ((env->me_flags & MDBX_WRITEMAP) != 0 || From 138a83c2be63cf5c2e35d200d53ebf5a908c2617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 20:55:53 +0300 Subject: [PATCH 144/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BD=D0=B5=D1=81=D0=BA=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=BA=D0=BE=20`MDBX=5FMAYBE=5FUNUSED`=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D0=BB=D0=B8=D0=BA=D0=B2=D0=B8=D0=B4=D0=B0=D1=86=D0=B8?= =?UTF-8?q?=D0=B8=20=D0=BF=D1=80=D0=B5=D0=B4=D1=83=D0=BF=D1=80=D0=B5=D0=B6?= =?UTF-8?q?=D0=B4=D0=B5=D0=BD=D0=B8=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.c | 6 ++++-- src/osal.h | 13 ++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/osal.c b/src/osal.c index 54d8a9dd..cd736562 100644 --- a/src/osal.c +++ b/src/osal.c @@ -552,8 +552,10 @@ static const DWORD WC_ERR_INVALID_CHARS = : 0; #endif /* WC_ERR_INVALID_CHARS */ -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n) { +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, + size_t dst_n, + const char *src, + size_t src_n) { return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, (int)src_n, dst, (int)dst_n); } diff --git a/src/osal.h b/src/osal.h index 20842eff..239d3f95 100644 --- a/src/osal.h +++ b/src/osal.h @@ -372,16 +372,18 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk( osal_ioring_t *ior, iov_ctx_t *ctx, void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)); -static inline unsigned osal_ioring_left(const osal_ioring_t *ior) { +MDBX_MAYBE_UNUSED static inline unsigned +osal_ioring_left(const osal_ioring_t *ior) { return ior->slots_left; } -static inline unsigned osal_ioring_used(const osal_ioring_t *ior) { +MDBX_MAYBE_UNUSED 
static inline unsigned +osal_ioring_used(const osal_ioring_t *ior) { return ior->allocated - ior->slots_left; } -static inline int osal_ioring_reserve(osal_ioring_t *ior, size_t items, - size_t bytes) { +MDBX_MAYBE_UNUSED static inline int +osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) const size_t npages = bytes >> ior->pagesize_ln2; @@ -585,7 +587,8 @@ MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -static inline uint32_t osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) { +MDBX_MAYBE_UNUSED static inline uint32_t +osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) { uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime); return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0); } From 98a2bd785a0e2a3f63d4e47689c240ebaace2430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 22:24:03 +0300 Subject: [PATCH 145/364] =?UTF-8?q?mdbx-windows:=20=D0=BF=D0=B5=D1=80?= =?UTF-8?q?=D0=B5=D0=BC=D0=B5=D1=89=D0=B5=D0=BD=D0=B0=20=D0=B4=D0=B5=D0=BA?= =?UTF-8?q?=D0=BB=D0=B0=D1=80=D0=B0=D1=86=D0=B8=D1=8F=20`osal=5Fmb2w()`=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D0=BB=D0=B8=D0=BA=D0=B2=D0=B8=D0=B4=D0=B0?= =?UTF-8?q?=D1=86=D0=B8=D0=B8=20=D0=BF=D1=80=D0=B5=D0=B4=D1=83=D0=BF=D1=80?= =?UTF-8?q?=D0=B5=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/osal.h b/src/osal.h index 239d3f95..b3eda1c0 100644 --- a/src/osal.h +++ b/src/osal.h @@ -180,9 +180,6 @@ static inline void osal_free(void *ptr) { 
HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - #else /*----------------------------------------------------------------------*/ typedef pthread_t osal_thread_t; @@ -702,6 +699,9 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) +MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, + size_t src_n); + #define OSAL_MB2WIDE(FROM, TO) \ do { \ const char *const from_tmp = (FROM); \ From 98e29fe628a92d4a2c480a538f0b97d043f9e71f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 9 Oct 2022 23:05:20 +0300 Subject: [PATCH 146/364] =?UTF-8?q?mdbx-windows:=20UNICODE-=D0=B7=D0=B0?= =?UTF-8?q?=D0=B2=D0=B8=D1=81=D0=B8=D0=BC=D0=BE=D0=B5=20=D0=BE=D0=BF=D1=80?= =?UTF-8?q?=D0=B5=D0=B4=D0=B5=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BC=D0=B0?= =?UTF-8?q?=D0=BA=D1=80=D0=BE=D1=81=D0=BE=D0=B2=20MDBX=5FDATANAME,=20MDBX?= =?UTF-8?q?=5FLOCKNAME=20=D0=B8=20MDBX=5FLOCK=5FSUFFIX.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 30 ++++++++++++++++++++++++------ src/internals.h | 4 ++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/mdbx.h b/mdbx.h index b8bca567..79f444c5 100644 --- a/mdbx.h +++ b/mdbx.h @@ -830,8 +830,14 @@ enum MDBX_constants { #if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCKNAME "/mdbx.lck" #else -#define MDBX_LOCKNAME L"\\mdbx.lck" -#endif +#define MDBX_LOCKNAME_W L"\\mdbx.lck" +#define MDBX_LOCKNAME_A "\\mdbx.lck" +#ifdef UNICODE +#define MDBX_LOCKNAME MDBX_LOCKNAME_W +#else +#define MDBX_LOCKNAME MDBX_LOCKNAME_A +#endif /* UNICODE */ +#endif /* Windows */ #endif /* MDBX_LOCKNAME */ #ifndef MDBX_DATANAME /** \brief The name of the data file in the environment @@ 
-839,8 +845,14 @@ enum MDBX_constants { #if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_DATANAME "/mdbx.dat" #else -#define MDBX_DATANAME L"\\mdbx.dat" -#endif +#define MDBX_DATANAME_W L"\\mdbx.dat" +#define MDBX_DATANAME_A "\\mdbx.dat" +#ifdef UNICODE +#define MDBX_DATANAME MDBX_DATANAME_W +#else +#define MDBX_DATANAME MDBX_DATANAME_A +#endif /* UNICODE */ +#endif /* Windows */ #endif /* MDBX_DATANAME */ #ifndef MDBX_LOCK_SUFFIX @@ -848,8 +860,14 @@ enum MDBX_constants { #if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCK_SUFFIX "-lck" #else -#define MDBX_LOCK_SUFFIX L"-lck" -#endif +#define MDBX_LOCK_SUFFIX_W L"-lck" +#define MDBX_LOCK_SUFFIX_A "-lck" +#ifdef UNICODE +#define MDBX_LOCK_SUFFIX MDBX_LOCK_SUFFIX_W +#else +#define MDBX_LOCK_SUFFIX MDBX_LOCK_SUFFIX_A +#endif /* UNICODE */ +#endif /* Windows */ #endif /* MDBX_LOCK_SUFFIX */ /* DEBUG & LOGGING ************************************************************/ diff --git a/src/internals.h b/src/internals.h index 8298989d..44cd4347 100644 --- a/src/internals.h +++ b/src/internals.h @@ -128,6 +128,10 @@ #define __USE_MINGW_ANSI_STDIO 1 #endif /* __USE_MINGW_ANSI_STDIO */ +#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE) +#define UNICODE +#endif /* UNICODE */ + #include "../mdbx.h" #include "base.h" From 9cdee2adb5c9bf8afc33f26e532c06205d4a16e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 15:52:58 +0300 Subject: [PATCH 147/364] =?UTF-8?q?mdbx-cmake:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5?= =?UTF-8?q?=D1=80=D0=B6=D0=BA=D0=B0=20=D0=BE=D0=BF=D1=86=D0=B8=D0=B8=20`MD?= =?UTF-8?q?BX=5FAVOID=5FMSYNC`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 5 ++++- src/config.h.in | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 0b9155aa..a631fa78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -494,11 +494,14 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") if(MDBX_NTDLL_EXTRA_IMPLIB) add_mdbx_option(MDBX_WITHOUT_MSVC_CRT "Avoid dependence from MSVC CRT and use ntdll.dll instead" OFF) endif() + set(MDBX_AVOID_MSYNC_DEFAULT ON) else() add_mdbx_option(MDBX_USE_OFDLOCKS "Use Open file description locks (aka OFD locks, non-POSIX)" AUTO) mark_as_advanced(MDBX_USE_OFDLOCKS) + set(MDBX_AVOID_MSYNC_DEFAULT OFF) endif() -add_mdbx_option(MDBX_LOCKING "Locking method (Win32=-1, SysV=5, POSIX=1988, POSIX=2001, POSIX=2008, Futexes=1995)" AUTO) +option(MDBX_AVOID_MSYNC "Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP mode" ${MDBX_AVOID_MSYNC_DEFAULT}) +add_mdbx_option(MDBX_LOCKING "Locking method (Windows=-1, SysV=5, POSIX=1988, POSIX=2001, POSIX=2008, Futexes=1995)" AUTO) mark_as_advanced(MDBX_LOCKING) add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Clock or just a fake" AUTO) mark_as_advanced(MDBX_TRUST_RTC) diff --git a/src/config.h.in b/src/config.h.in index 58119c33..786a8c0b 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -27,6 +27,7 @@ #cmakedefine01 MDBX_TRUST_RTC #endif #cmakedefine01 MDBX_DISABLE_VALIDATION +#cmakedefine01 MDBX_AVOID_MSYNC /* Windows */ #cmakedefine01 MDBX_WITHOUT_MSVC_CRT From c3dd60fcb6e3ac38e4e4048efd939a3b66f777a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 16:33:51 +0300 Subject: [PATCH 148/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`mdbx=5Fenv=5Fget=5Fpairsize4p?= =?UTF-8?q?age=5Fmax()`=20=D0=B8=20`mdbx=5Fenv=5Fget=5Fvalsize4page=5Fmax(?= =?UTF-8?q?)`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 26 ++++++++++++++++++++++++++ 
src/core.c | 20 ++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/mdbx.h b/mdbx.h index 79f444c5..fa72b73d 100644 --- a/mdbx.h +++ b/mdbx.h @@ -3307,6 +3307,32 @@ mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags); MDBX_DEPRECATED MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_env_get_maxkeysize(const MDBX_env *env); +/** \brief Returns maximal size of key-value pair to fit in a single page + * for specified database flags. + * \ingroup c_statinfo + * + * \param [in] env An environment handle returned by \ref mdbx_env_create(). + * \param [in] flags Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY + * and so on). \see db_flags + * + * \returns The maximum size of a data can write, + * or -1 if something is wrong. */ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_env_get_pairsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags); + +/** \brief Returns maximal data size in bytes to fit in a leaf-page or + * single overflow/large-page for specified database flags. + * \ingroup c_statinfo + * + * \param [in] env An environment handle returned by \ref mdbx_env_create(). + * \param [in] flags Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY + * and so on). \see db_flags + * + * \returns The maximum size of a data can write, + * or -1 if something is wrong. */ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_env_get_valsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags); + /** \brief Sets application information (a context pointer) associated with * the environment. 
* \see mdbx_env_get_userctx() diff --git a/src/core.c b/src/core.c index c4ae564e..295d8305 100644 --- a/src/core.c +++ b/src/core.c @@ -546,8 +546,16 @@ __cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, return LEAF_NODE_MAX(pagesize) - NODESIZE; } -intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, - MDBX_db_flags_t flags) { +__cold int mdbx_env_get_pairsize4page_max(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) + return -1; + + return (int)mdbx_limits_pairsize4page_max((intptr_t)env->me_psize, flags); +} + +__cold intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { if (pagesize < 1) pagesize = (intptr_t)mdbx_default_pagesize(); if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || @@ -562,6 +570,14 @@ intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, return PAGEROOM(pagesize); } +__cold int mdbx_env_get_valsize4page_max(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) + return -1; + + return (int)mdbx_limits_valsize4page_max((intptr_t)env->me_psize, flags); +} + /* Calculate the size of a leaf node. 
* * The size depends on the environment's page size; if a data item From 25ab65b470b953b0fc688158fd37ffa502f05452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 16:37:59 +0300 Subject: [PATCH 149/364] =?UTF-8?q?mdbx++:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`env::limits::pairsize4p?= =?UTF-8?q?age=5Fmax()`=20=D0=B8=20`env::limits::valsize4page=5Fmax()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h++ | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/mdbx.h++ b/mdbx.h++ index 473def91..dddc1880 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -3177,6 +3177,7 @@ public: /// \brief Returns the minimal values size in bytes for specified values /// mode. static inline size_t value_min(value_mode) noexcept; + /// \brief Returns the maximal value size in bytes for specified page size /// and database flags. static inline size_t value_max(intptr_t pagesize, MDBX_db_flags_t flags); @@ -3189,6 +3190,35 @@ public: /// \brief Returns the maximal value size in bytes for specified page size /// and values mode. static inline size_t value_max(const env &, value_mode); + + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for specified size and database flags. + static inline size_t pairsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags); + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for specified page size and values mode. + static inline size_t pairsize4page_max(intptr_t pagesize, value_mode); + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for given environment and database flags. 
+ static inline size_t pairsize4page_max(const env &, MDBX_db_flags_t flags); + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for given environment and values mode. + static inline size_t pairsize4page_max(const env &, value_mode); + + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for specified size and database flags. + static inline size_t valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags); + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for specified page size and values mode. + static inline size_t valsize4page_max(intptr_t pagesize, value_mode); + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for given environment and database flags. + static inline size_t valsize4page_max(const env &, MDBX_db_flags_t flags); + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for given environment and values mode. + static inline size_t valsize4page_max(const env &, value_mode); + /// \brief Returns the maximal write transaction size (i.e. limit for /// summary volume of dirty pages) in bytes for specified page size. 
static inline size_t transaction_size_max(intptr_t pagesize); @@ -4882,6 +4912,56 @@ inline size_t env::limits::value_max(const env &env, value_mode mode) { return value_max(env, MDBX_db_flags_t(mode)); } +inline size_t env::limits::pairsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_limits_pairsize4page_max(pagesize, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::pairsize4page_max(intptr_t pagesize, + value_mode mode) { + return pairsize4page_max(pagesize, MDBX_db_flags_t(mode)); +} + +inline size_t env::limits::pairsize4page_max(const env &env, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_env_get_pairsize4page_max(env, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::pairsize4page_max(const env &env, value_mode mode) { + return pairsize4page_max(env, MDBX_db_flags_t(mode)); +} + +inline size_t env::limits::valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_limits_valsize4page_max(pagesize, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::valsize4page_max(intptr_t pagesize, + value_mode mode) { + return valsize4page_max(pagesize, MDBX_db_flags_t(mode)); +} + +inline size_t env::limits::valsize4page_max(const env &env, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_env_get_valsize4page_max(env, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::valsize4page_max(const env &env, value_mode mode) { + return valsize4page_max(env, MDBX_db_flags_t(mode)); +} + inline size_t env::limits::transaction_size_max(intptr_t pagesize) { const intptr_t result = 
mdbx_limits_txnsize_max(pagesize); if (result < 0) From e46ca81abdd83fe4393285071085217310709c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 17:03:07 +0300 Subject: [PATCH 150/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 99 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 6 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 543b5d83..e9b3696e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,20 +1,107 @@ ChangeLog --------- -## Underway v0.12.2 +## В разработке v0.12.2 -New: - - Added `MDBX_HAVE_BUILT IN_CPU_SUPPORTS` build option to control use GCC's `__builtin_cpu_supports()` function, - which could be unavailable on a fake OSes (macos, ios, android, etc). +Новое: -Fixes: + - В C++ API добавлены методы фиксации транзакции с получением информации + о задержках. + - Отключение учета «грязных» страниц в не требующих этого режимах + (`MDBX_WRITEMAP` при `MDBX_AVOID_MSYNC=0`). Доработка позволяет снизить + накладные расходы и была запланирована давно, но откладывалась так как + требовала других изменений. + + - Вытеснение из памяти (спиллинг) «грязных» страниц с учетом размера + large/overflow-страниц. Доработка позволяет корректно соблюдать политику + задаваемую опциями `MDBX_opt_txn_dp_limit`, + `MDBX_opt_spill_max_denominator`, `MDBX_opt_spill_min_denominator` и + была запланирована давно, но откладывалась так как требовала других + изменений. + + - Для Windows в API добавлены UNICODE-зависимые определения макросов + `MDBX_DATANAME`, `MDBX_LOCKNAME` и `MDBX_LOCK_SUFFIX`. + + - Переход на преимущественное использование типа `size_t` для + уменьшения накладных расходов на платформе Эльбрус. 
+ + - В API добавлены функции `mdbx_limits_valsize4page_max()` и + `mdbx_env_get_valsize4page_max()` возвращающие максимальный размер в + байтах значения, которое может быть размещено в одной + large/overflow-странице, а не последовательности из двух или более таких + страниц. Для таблиц с поддержкой дубликатов вынос значений на + large/overflow-страницы не поддерживается, поэтому результат совпадает с + `mdbx_limits_valsize_max()`. + + - В API добавлены функции `mdbx_limits_pairsize4page_max()` и + `mdbx_env_get_pairsize4page_max()` возвращающие в байтах максимальный + суммарный размер пары ключ-значение для их размещения на одной листовой + странице, без выноса значения на отдельную large/overflow-страницу. Для + таблиц с поддержкой дубликатов вынос значений на large/overflow-страницы + не поддерживается, поэтому результат определяет максимальный/допустимый + суммарный размер пары ключ-значение. + + - Реализовано использование асинхронной (overlapped) записи в Windows, + включая использование небуферизированного ввода-вывода и WriteGather(). + Это позволяет сократить накладные расходы и частично обойти проблемы + Windows с низкой производительностью ввода-вывода, включая большие + задержки FlushFileBuffers(). Новый код также обеспечивает консолидацию + записываемых регионов на всех платформах, а на Windows использование + событий (events) сведено к минимуму, одновременно с автоматическим + использованием WriteGather(). Поэтому ожидается существенное снижение + накладных расходов взаимодействия с ОС, а в Windows это ускорение, в + некоторых сценариях, может быть кратным в сравнении с LMDB. + + - Добавлена опция сборки `MDBX_AVOID_MSYNC`, которая определяет + поведение libmdbx в режиме `MDBX_WRITEMAP` (когда данные изменяются + непосредственно в отображенных в ОЗУ страницах БД): + + * Если `MDBX_AVOID_MSYNC=0` (по умолчанию на всех системах кроме Windows), + то (как прежде) сохранение данных выполняется посредством `msync()`, + либо `FlushViewOfFile()` на Windows.
На платформах с полноценной + подсистемой виртуальной памяти и адекватным файловым вводом-выводом + это обеспечивает минимум накладных расходов (один системный вызов) + и максимальную производительность. Однако, на Windows приводит + к значительной деградации, в том числе из-за того что после + `FlushViewOfFile()` требуется также вызов `FlushFileBuffers()` + с массой проблем и суеты внутри ядра ОС. + + * Если `MDBX_AVOID_MSYNC=1` (по умолчанию только на Windows), то + сохранение данных выполняется явной записью в файл каждой измененной + страницы БД. Это требует дополнительных накладных расходов, как + на отслеживание измененных страниц (ведение списков "грязных" + страниц), так и на системные вызовы для их записи. + Кроме этого, с точки зрения подсистемы виртуальной памяти ядра ОС, + страницы БД измененные в ОЗУ и явно записанные в файл, могут либо + оставаться "грязными" и быть повторно записаны ядром ОС позже, + либо требовать дополнительных накладных расходов для отслеживания + PTE (Page Table Entries), их модификации и дополнительного копирования + данных. Тем не менее, по имеющейся информации, на Windows такой путь + записи данных в целом обеспечивает более высокую производительность. + + - Added `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` build option to control use GCC's + `__builtin_cpu_supports()` function, which could be unavailable on a fake + OSes (macos, ios, android, etc). + +Исправления: + + - Доработан сбор информации о задержках при фиксации транзакций: + * Устранено искажение замеров длительности обновления GC + при включении отладочного внутреннего аудита; + * Защита от underflow-нуля только общей задержки в метриках, + чтобы исключить ситуации, когда сумма отдельных стадий + больше общей длительности. + - Ряд исправлений для устранения срабатываний проверочных утверждений в отладочных сборках. + - Исправление лишнего сброса данных на диск в режиме `MDBX_SAFE_NOSYNC` при обновлении GC.
- Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` which could result in returning `MDBX_EKEYMISMATCH` for valid cases. - Fixed nasty `clz()` bug (by using `_BitScanReverse()`, only MSVC builds affected). -Minors: +Мелочи: + - Добавлено описание использования файловых дескрипторов в различных режимах. + - Добавлено использование _CrtDbgReport() в отладочных сборках. - Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`. - Removed description of deprecated usage of `MDBX_NODUPDATA`. - Fixed regression ASAN/Valgring-enabled builds. From 22a84d656b95000ebb9a566b434859cac06ee348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 19:15:17 +0300 Subject: [PATCH 151/364] =?UTF-8?q?mdbx:=20=D0=BF=D1=80=D0=BE=D0=B2=D0=B5?= =?UTF-8?q?=D1=80=D0=BA=D0=B0=20=D0=B0=D1=82=D0=BE=D0=BC=D0=B0=D1=80=D0=BD?= =?UTF-8?q?=D0=BE=D1=81=D1=82=D0=B8=20C11-=D0=BE=D0=BF=D0=B5=D1=80=D0=B0?= =?UTF-8?q?=D1=86=D0=B8=D0=B9=20c=2032/64-=D0=B1=D0=B8=D1=82=D0=BD=D1=8B?= =?UTF-8?q?=D0=BC=D0=B8=20=D0=B4=D0=B0=D0=BD=D0=BD=D1=8B=D0=BC=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/core.c b/src/core.c index 295d8305..c0291d5d 100644 --- a/src/core.c +++ b/src/core.c @@ -11791,6 +11791,19 @@ __cold int mdbx_env_create(MDBX_env **penv) { return MDBX_EINVAL; *penv = nullptr; +#ifdef MDBX_HAVE_C11ATOMICS + if (unlikely(!atomic_is_lock_free((const volatile uint32_t *)penv))) { + ERROR("lock-free atomic ops for %u-bit types is required", 32); + return MDBX_INCOMPATIBLE; + } +#if MDBX_64BIT_ATOMIC + if (unlikely(!atomic_is_lock_free((const volatile uint64_t *)penv))) { + ERROR("lock-free atomic ops for %u-bit types is required", 64); + return MDBX_INCOMPATIBLE; + } +#endif /* MDBX_64BIT_ATOMIC */ +#endif /* 
MDBX_HAVE_C11ATOMICS */ + const size_t os_psize = osal_syspagesize(); if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); From 329af93436f4a91a2fb626d09824276f38c346e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 19:22:58 +0300 Subject: [PATCH 152/364] =?UTF-8?q?mdbx:=20=D1=83=D0=BC=D0=B5=D0=BD=D1=8C?= =?UTF-8?q?=D1=88=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=2042=20=D1=80=D0=B0?= =?UTF-8?q?=D0=B7=D0=B0=20=D0=B7=D0=BD=D0=B0=D1=87=D0=B5=D0=BD=D0=B8=D1=8F?= =?UTF-8?q?=20=D0=BF=D0=BE-=D1=83=D0=BC=D0=BE=D0=BB=D1=87=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=8E=20=D0=B4=D0=BB=D1=8F=20`me=5Foptions.dp=5Flimit`?= =?UTF-8?q?=20=D0=B2=20=D0=BE=D1=82=D0=BB=D0=B0=D0=B4=D0=BE=D1=87=D0=BD?= =?UTF-8?q?=D1=8B=D1=85=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B0=D1=85.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index c0291d5d..7f427fa6 100644 --- a/src/core.c +++ b/src/core.c @@ -11837,7 +11837,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_options.dp_reserve_limit = 1024; env->me_options.rp_augment_limit = 256 * 1024; - env->me_options.dp_limit = 64 * 1024; + env->me_options.dp_limit = MDBX_DEBUG ? 
64 * 1024 / 42 : 64 * 1024; if (env->me_options.dp_limit > MAX_PAGENO + 1 - NUM_METAS) env->me_options.dp_limit = MAX_PAGENO + 1 - NUM_METAS; env->me_options.dp_initial = MDBX_PNL_INITIAL; From f5a6e0c04fa107462a2d3bf5f6faf9f1e8f25381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 19:27:34 +0300 Subject: [PATCH 153/364] =?UTF-8?q?mdbx-make:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`gcc-riscv64-linux-gnu`?= =?UTF-8?q?=20=D0=B2=20=D1=81=D0=BF=D0=B8=D1=81=D0=BE=D0=BA=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=86=D0=B5=D0=BB=D0=B8=20`cross-gcc`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GNUmakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 997795a6..fd7dc74c 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -726,12 +726,12 @@ CROSS_LIST = mips-linux-gnu-gcc \ # s390x-linux-gnu-gcc - works (previously: qemu hang/abort) # sparc64-linux-gnu-gcc - coredump (qemu mmap-troubles, previously: qemu fails fcntl for F_SETLK/F_GETLK) # alpha-linux-gnu-gcc - coredump (qemu mmap-troubles) -CROSS_LIST_NOQEMU = sparc64-linux-gnu-gcc alpha-linux-gnu-gcc +CROSS_LIST_NOQEMU = sparc64-linux-gnu-gcc alpha-linux-gnu-gcc riscv64-linux-gnu-gcc cross-gcc: @echo ' Re-building by cross-compiler for: $(CROSS_LIST_NOQEMU) $(CROSS_LIST)' @echo "CORRESPONDING CROSS-COMPILERs ARE REQUIRED." 
- @echo "FOR INSTANCE: apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" + @echo "FOR INSTANCE: apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu riscv64-linux-gnu-gcc" $(QUIET)for CC in $(CROSS_LIST_NOQEMU) $(CROSS_LIST); do \ echo "===================== $$CC"; \ $(MAKE) IOARENA=false CXXSTD= clean && CC=$$CC CXX=$$(echo $$CC | sed 's/-gcc/-g++/') EXE_LDFLAGS=-static $(MAKE) IOARENA=false all || exit $$?; \ From 5242c5bfdc1c3515460de6134c2b28603667eb2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 21:06:33 +0300 Subject: [PATCH 154/364] =?UTF-8?q?mdbx:=20=D1=83=D0=BB=D1=83=D1=87=D1=88?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=8D=D0=B2=D1=80=D0=B8=D1=81=D1=82?= =?UTF-8?q?=D0=B8=D0=BA=D0=B8=20=D0=B2=D0=BA=D0=BB=D1=8E=D1=87=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F=20=D0=B0=D0=B2=D1=82=D0=BE-=D1=81=D0=BB=D0=B8?= =?UTF-8?q?=D1=8F=D0=BD=D0=B8=D1=8F=20=D0=B7=D0=B0=D0=BF=D0=B8=D1=81=D0=B5?= =?UTF-8?q?=D0=B9=20GC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index 7f427fa6..772b1adb 100644 --- a/src/core.c +++ b/src/core.c @@ -6509,21 +6509,23 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; if (likely(flags & MDBX_ALLOC_GC)) { - flags |= env->me_flags & MDBX_LIFORECLAIM; - if 
(txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) - flags |= MDBX_ALLOC_COALESCE; if (unlikely( /* If mc is updating the GC, then the retired-list cannot play catch-up with itself by growing while trying to save it. */ - (mc->mc_flags & C_RECLAIMING) || + (mc->mc_flags & (C_RECLAIMING | C_GCFREEZE)) || /* avoid (recursive) search inside empty tree and while tree is updating, todo4recovery://erased_by_github/libmdbx/issues/31 */ txn->mt_dbs[FREE_DBI].md_entries == 0 || /* If our dirty list is already full, we can't touch GC */ (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) - flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); + flags -= MDBX_ALLOC_GC; + else { + flags |= env->me_flags & MDBX_LIFORECLAIM; + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) + flags |= MDBX_ALLOC_COALESCE; + } } eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, From dd9780606b37dda2b91efad5cc0377e22ae6e181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 10 Oct 2022 21:59:51 +0300 Subject: [PATCH 155/364] =?UTF-8?q?mdbx-test:=20=D0=BD=D0=B5=D0=B1=D0=BE?= =?UTF-8?q?=D0=BB=D1=8C=D1=88=D0=B8=D0=B5=20=D0=BF=D1=80=D0=B0=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=20=D1=81=D0=BA=D1=80=D0=B8=D0=BF=D1=82=D0=B0=20`long=5Fs?= =?UTF-8?q?tochastic.sh`=20=D0=B4=D0=BB=D1=8F=20=D1=80=D0=B0=D0=B1=D0=BE?= =?UTF-8?q?=D1=82=D1=8B=20=D0=B2=20Windows.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/long_stochastic.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 21e371e2..29e70949 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -1,8 +1,4 @@ #!/usr/bin/env bash 
-if ! which make cc c++ tee >/dev/null; then - echo "Please install the following prerequisites: make cc c++ tee banner" >&2 - exit 1 -fi LIST=basic FROM=1 @@ -111,7 +107,7 @@ do --no-geometry-jitter) GEOMETRY_JITTER=no ;; - --pagesize) + --pagesize|--page-size) case "$2" in min|max|256|512|1024|2048|4096|8192|16386|32768|65536) PAGESIZE=$2 @@ -163,6 +159,11 @@ if [ -z "$MONITOR" ]; then export MALLOC_CHECK_=7 MALLOC_PERTURB_=42 fi +if ! which $([ "$SKIP_MAKE" == "no" ] && echo make cc c++) tee >/dev/null; then + echo "Please install the following prerequisites: make cc c++ tee banner" >&2 + exit 1 +fi + ############################################################################### # 1. clean data from prev runs and examine available RAM @@ -220,6 +221,11 @@ case ${UNAME} in echo "pagesize ${pagesize}K, freepages ${freepages}, ram_avail_mb ${ram_avail_mb}" ;; + MSYS*|MINGW*) + echo "FIXME: Fake support for ${UNAME}" + ram_avail_mb=32768 + ;; + *) echo "FIXME: ${UNAME} not supported by this script" exit 2 @@ -288,6 +294,10 @@ case ${UNAME} in fi ;; + MSYS*|MINGW*) + echo "FIXME: Fake support for ${UNAME}" + ;; + *) echo "FIXME: ${UNAME} not supported by this script" exit 2 From e5fc056035f194d682acd1a7bc9ba3a37459aebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 11 Oct 2022 13:11:12 +0300 Subject: [PATCH 156/364] =?UTF-8?q?mdbx:=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=84=D0=BE=D1=80=D0=BC=D0=B0=D1=82?= =?UTF-8?q?=D0=B0=20LCK=20=D0=B8=20=D1=81=D0=B5=D0=BC=D0=B0=D0=BD=D1=82?= =?UTF-8?q?=D0=B8=D0=BA=D0=B8=20=D0=BD=D0=B5=D0=BA=D0=BE=D1=82=D0=BE=D1=80?= =?UTF-8?q?=D1=8B=D1=85=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B5=D0=BD=D0=BD?= =?UTF-8?q?=D0=B8=D1=85=20=D0=BF=D0=BE=D0=BB=D0=B5=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Изменение формата LCK-файла означает что версии 
libmdbx использующие разный формат не смогут работать с одной БД одновременно, а только поочередно (LCK-файл переписывается при открытии первым открывающим БД процессом). 1. Поле mti_unsynced_pages теперь 64-битное (чтобы не контролировать переполнение) и перемещено для соблюдения выравнивания. 2. Поле mti_sync_timestamp переименовано в mti_eoos_timestamp одновременно со сменой семантики. Теперь время отсчитывается не от момента сброса данных на диск, а с момента входа в «грязное» состояние. Скорее всего, текущая версия формата LCK не окончательная и изменится до релиза. --- mdbx.h | 4 +++- src/core.c | 56 ++++++++++++++++++++++++++----------------------- src/internals.h | 18 ++++++++-------- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/mdbx.h b/mdbx.h index fa72b73d..17d1a17b 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2515,7 +2515,9 @@ struct MDBX_envinfo { uint64_t mi_unsync_volume; /** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */ uint64_t mi_autosync_threshold; - /** Time since the last steady sync in 1/65536 of second */ + /** Time since entering to a "dirty" out-of-sync state in units of 1/65536 of + * second. In other words, this is the time since the last non-steady commit + * or zero if it was steady. */ uint32_t mi_since_sync_seconds16dot16; /** Current auto-sync period in 1/65536 of second, * see \ref mdbx_env_set_syncperiod(). 
*/ diff --git a/src/core.c b/src/core.c index 772b1adb..a19dbb6e 100644 --- a/src/core.c +++ b/src/core.c @@ -4506,6 +4506,8 @@ __must_check_result static int iov_write(iov_ctx_t *ctx) { #if MDBX_ENABLE_PGOP_STAT ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; #endif /* MDBX_ENABLE_PGOP_STAT */ + if (!ctx->env->me_lck->mti_eoos_timestamp.weak) + ctx->env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); ctx->err = r.err; if (unlikely(ctx->err != MDBX_SUCCESS)) ERROR("Write error: %s", mdbx_strerror(ctx->err)); @@ -6787,6 +6789,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; /* wipe the last steady-point if one of: * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted @@ -6806,13 +6809,12 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { meta_prefer_steady(env, &txn->tw.troika).ptr_c); } else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || (autosync_threshold && - atomic_load32(&env->me_lck->mti_unsynced_pages, + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, - mo_Relaxed) >= - autosync_period) || + (eoos_timestamp = atomic_load64( + &env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period) || next >= txn->mt_geo.upper || (next >= txn->mt_end_pgno && (autosync_threshold | autosync_period) == 0)) { @@ -7311,8 +7313,8 @@ retry:; const meta_troika_t troika = meta_tap(env); head = meta_recent(env, &troika); } - const pgno_t unsynced_pages = - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); + const uint64_t unsynced_pages = + 
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed); if (unsynced_pages == 0) { const uint32_t synched_meta_txnid_u32 = atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); @@ -7320,15 +7322,16 @@ retry:; goto bailout; } - const pgno_t autosync_threshold = + const size_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || (autosync_period && - osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= - autosync_period)) + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; if (!inside_txn) { @@ -7396,7 +7399,7 @@ retry:; eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { - DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, + DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64, data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), unsynced_pages); MDBX_meta meta = *head.ptr_c; @@ -11341,13 +11344,14 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; if ((autosync_threshold && - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= - autosync_period)) + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, 
mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ } @@ -11459,7 +11463,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, /* LY: step#1 - sync previously written/updated data-pages */ rc = MDBX_RESULT_FALSE /* carry steady */; - if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { + if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; unsigned sync_op = 0; @@ -11494,10 +11498,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { - atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(), - mo_Relaxed); unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); - atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); + atomic_store64(&env->me_lck->mti_eoos_timestamp, 0, mo_Relaxed); + atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); @@ -12652,8 +12655,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "rollback NOT needed, steady-sync NEEDED%s", "opening after an unclean shutdown", bootid.x, bootid.y, ""); header = clone; - atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next, - mo_Relaxed); + env->me_lck->mti_unsynced_pages.weak = header.mm_geo.next; + if (!env->me_lck->mti_eoos_timestamp.weak) + env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); break; } if (unlikely(!prefer_steady.is_steady)) { @@ -20947,8 +20951,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); arg->mi_geo.shrink = pgno2bytes(env, 
pv2pages(txn_meta->mm_geo.shrink_pv)); arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); - const pgno_t unsynced_pages = - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + + const uint64_t unsynced_pages = + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (uint32_t)arg->mi_recent_txnid); @@ -20963,9 +20967,9 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_sys_pagesize = env->me_os_psize; if (likely(bytes > size_before_bootid)) { - arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); + arg->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); const uint64_t monotime_now = osal_monotime(); - uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); + uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed); arg->mi_since_sync_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); diff --git a/src/internals.h b/src/internals.h index 44cd4347..ab748f3a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -370,7 +370,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( /* FROZEN: The version number for a database's datafile format. */ #define MDBX_DATA_VERSION 3 /* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION 4 +#define MDBX_LOCK_VERSION 5 /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -748,20 +748,20 @@ typedef struct MDBX_lockinfo { atomic_txnid_t mti_oldest_reader; - /* Timestamp of the last steady sync. Value is represented in a suitable - * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or - * clock_gettime(CLOCK_MONOTONIC). */ - MDBX_atomic_uint64_t mti_sync_timestamp; + /* Timestamp of entering an out-of-sync state. 
Value is represented in a + * suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) + * or clock_gettime(CLOCK_MONOTONIC). */ + MDBX_atomic_uint64_t mti_eoos_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. */ - atomic_pgno_t mti_unsynced_pages; - - /* Number of page which was discarded last time by madvise(MADV_FREE). */ - atomic_pgno_t mti_discarded_tail; + MDBX_atomic_uint64_t mti_unsynced_pages; /* Timestamp of the last readers check. */ MDBX_atomic_uint64_t mti_reader_check_timestamp; + /* Number of page which was discarded last time by madvise(MADV_FREE). */ + atomic_pgno_t mti_discarded_tail; + /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; From 686c908a9564b62d44763f1eec61fe6bb291440a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 12 Oct 2022 13:48:44 +0300 Subject: [PATCH 157/364] =?UTF-8?q?mdbx:=20=D0=B1=D0=BE=D0=BB=D0=B5=D0=B5?= =?UTF-8?q?=20=D0=BE=D1=81=D1=82=D0=BE=D1=80=D0=BE=D0=B6=D0=BD=D0=BE=D0=B5?= =?UTF-8?q?=20=D0=BF=D1=80=D0=B5=D0=BE=D0=B1=D1=80=D0=B0=D0=B7=D0=BE=D0=B2?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BA=20=D1=82=D0=B8=D0=BF=D1=83=20?= =?UTF-8?q?`mdbx=5Ftid=5Ft`=20=D0=B4=D0=BB=D1=8F=20=D1=83=D1=81=D1=82?= =?UTF-8?q?=D1=80=D0=B0=D0=BD=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=BF=D1=80=D0=B5?= =?UTF-8?q?=D0=B4=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index a19dbb6e..479ca8de 100644 --- a/src/core.c +++ b/src/core.c @@ -21704,8 +21704,8 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, reader_pages_retired)) : 0; } - rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)tid, 
txnid, lag, - bytes_used, bytes_retained); + rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)((intptr_t)tid), + txnid, lag, bytes_used, bytes_retained); if (unlikely(rc != MDBX_SUCCESS)) break; } @@ -21940,7 +21940,7 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0; int rc = - callback(env, env->me_txn, pid, (mdbx_tid_t)tid, laggard, + callback(env, env->me_txn, pid, (mdbx_tid_t)((intptr_t)tid), laggard, (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); if (rc < 0) /* hsr returned error and/or agree MDBX_MAP_FULL error */ From 5a45c4a2109bb01160acdad1f071439d89e7f04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 12 Oct 2022 18:00:05 +0300 Subject: [PATCH 158/364] =?UTF-8?q?mdbx-windows:=20=D1=83=D0=B4=D0=B0?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BD=D0=B5=D0=BD=D1=83=D0=B6?= =?UTF-8?q?=D0=BD=D0=BE=D0=B3=D0=BE=20=D0=B2=D1=8B=D0=B7=D0=BE=D0=B2=D0=B0?= =?UTF-8?q?=20`LockFileEx()`=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B8=20`mdb?= =?UTF-8?q?x=5Fenv=5Fcopy()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/core.c b/src/core.c index 479ca8de..634969a5 100644 --- a/src/core.c +++ b/src/core.c @@ -20532,14 +20532,10 @@ LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, #endif ); - if (rc == MDBX_SUCCESS) { #if defined(_WIN32) || defined(_WIN64) - OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); - if (!LockFileEx(newfd, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, - 0, 0, INT32_MAX, &ov)) - rc = GetLastError(); + /* no locking required since the file opened with ShareMode == 0 */ #else + if (rc == MDBX_SUCCESS) { struct flock lock_op; memset(&lock_op, 0, sizeof(lock_op)); lock_op.l_type = F_WRLCK; @@ 
-20554,8 +20550,8 @@ LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, #endif /* Linux */ ) rc = errno; -#endif /* Windows / POSIX */ } +#endif /* Windows / POSIX */ if (rc == MDBX_SUCCESS) rc = mdbx_env_copy2fd(env, newfd, flags); From d94e65b870d5b9b68042bc55f5e90e34a0f86e0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 12 Oct 2022 20:47:22 +0300 Subject: [PATCH 159/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20`fcntl64(F?= =?UTF-8?q?=5FGETLK64/F=5FSETLK64/F=5FSETLKW64)`=20=D0=BF=D1=80=D0=B8=20?= =?UTF-8?q?=D0=BD=D0=B0=D0=BB=D0=B8=D1=87=D0=B8=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Это решает проблему срабатывания проверочного утверждения при сборке для платформ где тип off_t шире соответствующих полей структуры flock, используемой для блокировки файлов. --- src/core.c | 7 +++---- src/lck-posix.c | 47 +++++++++++++++++++++-------------------------- src/options.h | 5 ++++- src/osal.h | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/core.c b/src/core.c index 634969a5..a47a9c08 100644 --- a/src/core.c +++ b/src/core.c @@ -20536,14 +20536,13 @@ LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, /* no locking required since the file opened with ShareMode == 0 */ #else if (rc == MDBX_SUCCESS) { - struct flock lock_op; + MDBX_STRUCT_FLOCK lock_op; memset(&lock_op, 0, sizeof(lock_op)); lock_op.l_type = F_WRLCK; lock_op.l_whence = SEEK_SET; lock_op.l_start = 0; - lock_op.l_len = - (sizeof(lock_op.l_len) > 4 ? 
INT64_MAX : INT32_MAX) & ~(size_t)0xffff; - if (fcntl(newfd, F_SETLK, &lock_op) + lock_op.l_len = OFF_T_MAX; + if (MDBX_FCNTL(newfd, MDBX_F_SETLK, &lock_op) #if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) && \ (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24) || flock(newfd, LOCK_EX | LOCK_NB) diff --git a/src/lck-posix.c b/src/lck-posix.c index b881698d..afbe542b 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -178,26 +178,21 @@ __cold static void choice_fcntl(void) { of reliability reasons */ #endif /* linux */ ) { - op_setlk = F_OFD_SETLK; - op_setlkw = F_OFD_SETLKW; - op_getlk = F_OFD_GETLK; + op_setlk = MDBX_F_OFD_SETLK; + op_setlkw = MDBX_F_OFD_SETLKW; + op_getlk = MDBX_F_OFD_GETLK; return; } - op_setlk = F_SETLK; - op_setlkw = F_SETLKW; - op_getlk = F_GETLK; + op_setlk = MDBX_F_SETLK; + op_setlkw = MDBX_F_SETLKW; + op_getlk = MDBX_F_GETLK; } #else -#define op_setlk F_SETLK -#define op_setlkw F_SETLKW -#define op_getlk F_GETLK +#define op_setlk MDBX_F_SETLK +#define op_setlkw MDBX_F_SETLKW +#define op_getlk MDBX_F_GETLK #endif /* MDBX_USE_OFDLOCKS */ -#ifndef OFF_T_MAX -#define OFF_T_MAX \ - (((sizeof(off_t) > 4) ? 
INT64_MAX : INT32_MAX) & ~(size_t)0xffff) -#endif - static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, const off_t offset, off_t len) { STATIC_ASSERT(sizeof(off_t) >= sizeof(void *) && @@ -220,7 +215,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, assert((uint64_t)((off_t)((uint64_t)offset + (uint64_t)len)) == ((uint64_t)offset + (uint64_t)len)); for (;;) { - struct flock lock_op; + MDBX_STRUCT_FLOCK lock_op; STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) && sizeof(off_t) <= sizeof(lock_op.l_len) && OFF_T_MAX == (off_t)OFF_T_MAX, @@ -232,7 +227,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, lock_op.l_whence = SEEK_SET; lock_op.l_start = offset; lock_op.l_len = len; - int rc = fcntl(fd, cmd, &lock_op); + int rc = MDBX_FCNTL(fd, cmd, &lock_op); jitter4testing(true); if (rc != -1) { if (cmd == op_getlk) { @@ -246,18 +241,18 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, } rc = errno; #if MDBX_USE_OFDLOCKS - if (rc == EINVAL && - (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK)) { + if (rc == EINVAL && (cmd == MDBX_F_OFD_SETLK || cmd == MDBX_F_OFD_SETLKW || + cmd == MDBX_F_OFD_GETLK)) { /* fallback to non-OFD locks */ - if (cmd == F_OFD_SETLK) - cmd = F_SETLK; - else if (cmd == F_OFD_SETLKW) - cmd = F_SETLKW; + if (cmd == MDBX_F_OFD_SETLK) + cmd = MDBX_F_SETLK; + else if (cmd == MDBX_F_OFD_SETLKW) + cmd = MDBX_F_SETLKW; else - cmd = F_GETLK; - op_setlk = F_SETLK; - op_setlkw = F_SETLKW; - op_getlk = F_GETLK; + cmd = MDBX_F_GETLK; + op_setlk = MDBX_F_SETLK; + op_setlkw = MDBX_F_SETLKW; + op_getlk = MDBX_F_GETLK; continue; } #endif /* MDBX_USE_OFDLOCKS */ diff --git a/src/options.h b/src/options.h index c81529fd..762bd2eb 100644 --- a/src/options.h +++ b/src/options.h @@ -269,7 +269,10 @@ /** Advanced: Using POSIX OFD-locks (autodetection by default). 
*/ #ifndef MDBX_USE_OFDLOCKS -#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \ +#if ((defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && \ + defined(F_OFD_GETLK)) || \ + (defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \ + defined(F_OFD_GETLK64))) && \ !defined(MDBX_SAFE4QEMU) && \ !defined(__sun) /* OFD-lock are broken on Solaris */ #define MDBX_USE_OFDLOCKS 1 diff --git a/src/osal.h b/src/osal.h index b3eda1c0..6876911b 100644 --- a/src/osal.h +++ b/src/osal.h @@ -425,6 +425,40 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #define MAX_WRITE UINT32_C(0x04000000) #else #define MAX_WRITE UINT32_C(0x3f000000) + +#if defined(F_GETLK64) && defined(F_SETLK64) && defined(F_SETLKW64) && \ + !defined(__ANDROID_API__) +#define MDBX_F_SETLK F_SETLK64 +#define MDBX_F_SETLKW F_SETLKW64 +#define MDBX_F_GETLK F_GETLK64 +#define MDBX_FCNTL fcntl64 +#define MDBX_STRUCT_FLOCK struct flock64 +#ifndef OFF_T_MAX +#define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000) +#endif /* OFF_T_MAX */ +#else +#define MDBX_F_SETLK F_SETLK +#define MDBX_F_SETLKW F_SETLKW +#define MDBX_F_GETLK F_GETLK +#define MDBX_FCNTL fcntl +#define MDBX_STRUCT_FLOCK struct flock +#endif /* MDBX_F_SETLK, MDBX_F_SETLKW, MDBX_F_GETLK */ + +#if defined(F_OFD_SETLK64) && defined(F_OFD_SETLKW64) && \ + defined(F_OFD_GETLK64) && !defined(__ANDROID_API__) +#define MDBX_F_OFD_SETLK F_OFD_SETLK64 +#define MDBX_F_OFD_SETLKW F_OFD_SETLKW64 +#define MDBX_F_OFD_GETLK F_OFD_GETLK64 +#else +#define MDBX_F_OFD_SETLK F_OFD_SETLK +#define MDBX_F_OFD_SETLKW F_OFD_SETLKW +#define MDBX_F_OFD_GETLK F_OFD_GETLK +#ifndef OFF_T_MAX +#define OFF_T_MAX \ + (((sizeof(off_t) > 4) ? 
INT64_MAX : INT32_MAX) & ~(size_t)0xFffff) +#endif /* OFF_T_MAX */ +#endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */ + #endif #if defined(__linux__) || defined(__gnu_linux__) From f5fee949e3dcbe9cc6761927f69057196990e1f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 13 Oct 2022 17:30:43 +0300 Subject: [PATCH 160/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=BE=D1=87=D0=B8=D0=BD?= =?UTF-8?q?=D0=BA=D0=B0=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B8=20=D0=B4?= =?UTF-8?q?=D0=BB=D1=8F=20=D1=81=D1=82=D0=B0=D1=80=D1=8B=D1=85=20=D0=B2?= =?UTF-8?q?=D0=B5=D1=80=D1=81=D0=B8=D0=B9=20glibc=20=D0=BF=D0=BE=D1=81?= =?UTF-8?q?=D0=BB=D0=B5=20=D0=B7=D0=B0=D0=B4=D0=B5=D0=B9=D1=81=D1=82=D0=B2?= =?UTF-8?q?=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20`fcntl64()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/osal.h b/src/osal.h index 6876911b..017abcb4 100644 --- a/src/osal.h +++ b/src/osal.h @@ -431,7 +431,14 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #define MDBX_F_SETLK F_SETLK64 #define MDBX_F_SETLKW F_SETLKW64 #define MDBX_F_GETLK F_GETLK64 +#if (__GLIBC_PREREQ(2, 28) && \ + (defined(__USE_LARGEFILE64) || defined(__LARGEFILE64_SOURCE) || \ + defined(_USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE))) || \ + defined(fcntl64) #define MDBX_FCNTL fcntl64 +#else +#define MDBX_FCNTL fcntl +#endif #define MDBX_STRUCT_FLOCK struct flock64 #ifndef OFF_T_MAX #define OFF_T_MAX UINT64_C(0x7fffFFFFfff00000) From 6c986ce904d5d1a8ccf636fb49800a90021cdd1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 13 Oct 2022 17:36:53 +0300 Subject: [PATCH 161/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=81=D1=82=D1=8B?= 
=?UTF-8?q?=D0=BB=D1=8C=20=D0=B4=D0=BB=D1=8F=20=D1=81=D1=82=D0=B0=D1=80?= =?UTF-8?q?=D1=8B=D1=85=20=D0=B2=D0=B5=D1=80=D1=81=D0=B8=D0=B9=20`stdatomi?= =?UTF-8?q?c.h`,=20=D0=B3=D0=B4=D0=B5=20=D0=BC=D0=B0=D0=BA=D1=80=D0=BE?= =?UTF-8?q?=D1=81=D1=8B=20`ATOMIC=5F*=5FLOCK=5FFREE`=20=D0=BE=D1=88=D0=B8?= =?UTF-8?q?=D0=B1=D0=BE=D1=87=D0=BD=D0=BE=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BE?= =?UTF-8?q?=D0=BF=D1=80=D0=B5=D0=B4=D0=B5=D0=BB=D1=8F=D1=8E=D1=82=D1=81?= =?UTF-8?q?=D1=8F=20=D1=87=D0=B5=D1=80=D0=B5=D0=B7=20=D1=84=D1=83=D0=BD?= =?UTF-8?q?=D0=BA=D1=86=D0=B8=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 21 --------------------- src/options.h | 14 +++++++------- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git a/src/core.c b/src/core.c index a47a9c08..3e7ebd23 100644 --- a/src/core.c +++ b/src/core.c @@ -934,14 +934,7 @@ static __always_inline bool atomic_cas64(MDBX_atomic_uint64_t *p, uint64_t c, uint64_t v) { #ifdef MDBX_HAVE_C11ATOMICS STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); -#ifdef ATOMIC_LLONG_LOCK_FREE - STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0); -#if ATOMIC_LLONG_LOCK_FREE < 2 assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); -#endif /* ATOMIC_LLONG_LOCK_FREE < 2 */ -#else /* defined(ATOMIC_LLONG_LOCK_FREE) */ - assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); -#endif return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(&p->weak, c, v); @@ -960,14 +953,7 @@ static __always_inline bool atomic_cas32(MDBX_atomic_uint32_t *p, uint32_t c, uint32_t v) { #ifdef MDBX_HAVE_C11ATOMICS STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); -#ifdef ATOMIC_INT_LOCK_FREE - STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); -#if ATOMIC_INT_LOCK_FREE < 2 assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); -#endif -#else - assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); -#endif return 
atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(&p->weak, c, v); @@ -986,14 +972,7 @@ static __always_inline uint32_t atomic_add32(MDBX_atomic_uint32_t *p, uint32_t v) { #ifdef MDBX_HAVE_C11ATOMICS STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); -#ifdef ATOMIC_INT_LOCK_FREE - STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); -#if ATOMIC_INT_LOCK_FREE < 2 assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); -#endif -#else - assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); -#endif return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v); #elif defined(__GNUC__) || defined(__clang__) return __sync_fetch_and_add(&p->weak, v); diff --git a/src/options.h b/src/options.h index 762bd2eb..127a13be 100644 --- a/src/options.h +++ b/src/options.h @@ -358,13 +358,7 @@ #endif /* MDBX_64BIT_ATOMIC */ #ifndef MDBX_64BIT_CAS -#if defined(ATOMIC_LLONG_LOCK_FREE) -#if ATOMIC_LLONG_LOCK_FREE > 1 -#define MDBX_64BIT_CAS 1 -#else -#define MDBX_64BIT_CAS 0 -#endif -#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE) +#if defined(__GCC_ATOMIC_LLONG_LOCK_FREE) #if __GCC_ATOMIC_LLONG_LOCK_FREE > 1 #define MDBX_64BIT_CAS 1 #else @@ -376,6 +370,12 @@ #else #define MDBX_64BIT_CAS 0 #endif +#elif defined(ATOMIC_LLONG_LOCK_FREE) +#if ATOMIC_LLONG_LOCK_FREE > 1 +#define MDBX_64BIT_CAS 1 +#else +#define MDBX_64BIT_CAS 0 +#endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 #else From c4beb5a4a0c9173d394a03235367d68ac6fd35d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 13 Oct 2022 17:56:42 +0300 Subject: [PATCH 162/364] =?UTF-8?q?mdbx-test:=20=D0=BD=D0=B5=20=D0=B2?= =?UTF-8?q?=D1=8B=D0=B7=D1=8B=D0=B2=D0=B0=D0=B5=D0=BC=20`sudo`=20=D0=BF?= =?UTF-8?q?=D1=80=D0=B8=20=D0=BE=D1=82=D1=81=D1=83=D1=82=D1=81=D1=82=D0=B2?= =?UTF-8?q?=D0=B8=D0=B8.?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/long_stochastic.sh | 6 +++++- test/stochastic_small.sh | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 29e70949..d75607ec 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -275,7 +275,11 @@ case ${UNAME} in ulimit -c unlimited if [ "$(cat /proc/sys/kernel/core_pattern)" != "core.%p" ]; then echo "core.%p > /proc/sys/kernel/core_pattern" >&2 - echo "core.%p" | sudo tee /proc/sys/kernel/core_pattern || true + if [ $(id -u) -ne 0 -a -n "$(which sudo 2>/dev/null)" ]; then + echo "core.%p" | sudo tee /proc/sys/kernel/core_pattern || true + else + (echo "core.%p" > /proc/sys/kernel/core_pattern) || true + fi fi ;; diff --git a/test/stochastic_small.sh b/test/stochastic_small.sh index 5e216ced..b0aa1cb7 100755 --- a/test/stochastic_small.sh +++ b/test/stochastic_small.sh @@ -213,7 +213,11 @@ case ${UNAME} in ulimit -c unlimited if [ "$(cat /proc/sys/kernel/core_pattern)" != "core.%p" ]; then echo "core.%p > /proc/sys/kernel/core_pattern" >&2 - echo "core.%p" | sudo tee /proc/sys/kernel/core_pattern || true + if [ $(id -u) -ne 0 -a -n "$(which sudo 2>/dev/null)" ]; then + echo "core.%p" | sudo tee /proc/sys/kernel/core_pattern || true + else + (echo "core.%p" > /proc/sys/kernel/core_pattern) || true + fi fi ;; From 51a765a5a74f10291d218456aaaf2eee51854f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 13 Oct 2022 18:59:28 +0300 Subject: [PATCH 163/364] =?UTF-8?q?mdbx-make:=20=D0=B2=D1=8B=D0=B2=D0=BE?= =?UTF-8?q?=D0=B4=20=D0=BF=D1=80=D0=BE=D1=82=D0=BE=D0=BA=D0=BE=D0=BB=D0=B0?= =?UTF-8?q?=20=D0=BF=D1=80=D0=B8=20=D1=81=D0=B1=D0=BE=D0=B5=20=D1=82=D0=B5?= =?UTF-8?q?=D1=81=D1=82=D0=BE=D0=B2=D1=8B=D1=85=20=D1=86=D0=B5=D0=BB=D0=B5?= =?UTF-8?q?=D0=B9.?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GNUmakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index fd7dc74c..92c23dbc 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -418,7 +418,7 @@ smoke-fault: build-test test: build-test @echo ' RUNNING `test/long_stochastic.sh --loops 2`...' - $(QUIET)test/long_stochastic.sh --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) + $(QUIET)test/long_stochastic.sh --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) long-test: build-test @echo ' RUNNING `test/long_stochastic.sh --loops 42`...' @@ -426,12 +426,12 @@ long-test: build-test test-singleprocess: build-test @echo ' RUNNING `test/long_stochastic.sh --single --loops 2`...' - $(QUIET)test/long_stochastic.sh --single --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) + $(QUIET)test/long_stochastic.sh --single --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) test-valgrind: CFLAGS_EXTRA=-Ofast -DMDBX_USE_VALGRIND test-valgrind: build-test @echo ' RUNNING `test/long_stochastic.sh --with-valgrind --loops 2`...' 
- $(QUIET)test/long_stochastic.sh --with-valgrind --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) + $(QUIET)test/long_stochastic.sh --with-valgrind --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) memcheck: VALGRIND=valgrind --trace-children=yes --log-file=valgrind-%p.log --leak-check=full --track-origins=yes --error-exitcode=42 --suppressions=test/valgrind_suppress.txt memcheck: CFLAGS_EXTRA=-Ofast -DMDBX_USE_VALGRIND From 80f9f73a5ec32f0511c941c4677828f510e371ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 14 Oct 2022 00:20:37 +0300 Subject: [PATCH 164/364] =?UTF-8?q?mdbx:=20=D1=87=D1=83=D1=82=D1=8C=20?= =?UTF-8?q?=D0=B1=D0=BE=D0=BB=D1=8C=D1=88=D0=B5=20=D0=BA=D0=BE=D0=BD=D1=82?= =?UTF-8?q?=D1=80=D0=BE=D0=BB=D1=8F=20=D0=B8=20=D0=BF=D0=B0=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=BE=D0=B9=D0=B8=20=D0=B4=D0=BB=D1=8F=20=D1=81=D1=82?= =?UTF-8?q?=D1=80=D0=B0=D1=85=D0=BE=D0=B2=D0=BA=D0=B8=20=D0=BE=D1=82=20?= =?UTF-8?q?=D0=B4=D0=B5=D1=84=D0=B5=D0=BA=D1=82=D0=BE=D0=B2=20`mremap()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Есть основание полагать, что mremap() может возвращать MAP_FAILED, но НЕ устанавливать errno в некоторых пограничных ситуациях. Например, когда системных ресурсов не хватает на актуализацию/копирование/клонирование состояния отображения на финальной стадии, в том числе из-за раскраски исходного отображения разными флагами через madvise(). 
--- src/osal.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/src/osal.c b/src/osal.c index cd736562..a1a85bac 100644 --- a/src/osal.c +++ b/src/osal.c @@ -2166,6 +2166,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, map->limit = 0; map->current = 0; map->address = nullptr; + assert(errno != 0); return errno; } map->limit = limit; @@ -2204,8 +2205,10 @@ MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { if (!NT_SUCCESS(rc)) ntstatus2errcode(rc); #else - if (unlikely(munmap(map->address, map->limit))) + if (unlikely(munmap(map->address, map->limit))) { + assert(errno != 0); return errno; + } #endif /* ! Windows */ map->limit = 0; @@ -2428,8 +2431,10 @@ retry_mapview:; if (limit < map->limit) { /* unmap an excess at end of mapping. */ // coverity[offset_free : FALSE] - if (unlikely(munmap(map->dxb + limit, map->limit - limit))) + if (unlikely(munmap(map->dxb + limit, map->limit - limit))) { + assert(errno != 0); return errno; + } map->limit = limit; return rc; } @@ -2441,14 +2446,19 @@ retry_mapview:; assert(limit > map->limit); uint8_t *ptr = MAP_FAILED; -#if defined(MREMAP_MAYMOVE) +#if (defined(__linux__) || defined(__gnu_linux__)) && defined(_GNU_SOURCE) ptr = mremap(map->address, map->limit, limit, - (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : 0); +#if defined(MREMAP_MAYMOVE) + (flags & MDBX_MRESIZE_MAY_MOVE) ? 
MREMAP_MAYMOVE : +#endif /* MREMAP_MAYMOVE */ + 0); if (ptr == MAP_FAILED) { err = errno; + assert(err != 0); switch (err) { default: return err; + case 0 /* paranoia */: case EAGAIN: case ENOMEM: return MDBX_UNABLE_EXTEND_MAPSIZE; @@ -2456,7 +2466,7 @@ retry_mapview:; break; } } -#endif /* MREMAP_MAYMOVE */ +#endif /* Linux & _GNU_SOURCE */ const unsigned mmap_flags = MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE | @@ -2469,17 +2479,22 @@ retry_mapview:; ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot, mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); if (ptr == map->dxb + map->limit) + /* успешно прилепили отображение в конец */ ptr = map->dxb; else if (ptr != MAP_FAILED) { /* the desired address is busy, unmap unsuitable one */ - if (unlikely(munmap(ptr, limit - map->limit))) + if (unlikely(munmap(ptr, limit - map->limit))) { + assert(errno != 0); return errno; + } ptr = MAP_FAILED; } else { err = errno; + assert(err != 0); switch (err) { default: return err; + case 0 /* paranoia */: case EAGAIN: case ENOMEM: return MDBX_UNABLE_EXTEND_MAPSIZE; @@ -2498,8 +2513,10 @@ retry_mapview:; return MDBX_UNABLE_EXTEND_MAPSIZE; } - if (unlikely(munmap(map->address, map->limit))) + if (unlikely(munmap(map->address, map->limit))) { + assert(errno != 0); return errno; + } // coverity[pass_freed_arg : FALSE] ptr = mmap(map->address, limit, mmap_prot, @@ -2543,6 +2560,7 @@ retry_mapview:; map->limit = 0; map->current = 0; map->address = nullptr; + assert(errno != 0); return errno; } rc = MDBX_UNABLE_EXTEND_MAPSIZE; @@ -2569,8 +2587,10 @@ retry_mapview:; #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK - if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) + if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) { + assert(errno != 0); return errno; + } #endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); @@ -2579,6 +2599,9 @@ retry_mapview:; #endif /* POSIX / Windows */ + 
assert(rc != MDBX_SUCCESS || + (map->address != nullptr && map->address != MAP_FAILED && + map->current == size && map->limit == limit)); return rc; } From 39c6387d231cd23ad1d1597f7ba357fdec5d3b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 14 Oct 2022 11:47:34 +0300 Subject: [PATCH 165/364] =?UTF-8?q?mdbx:=20=D0=9E=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index e9b3696e..8842b125 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -3,12 +3,8 @@ ChangeLog ## В разработке v0.12.2 - Новое: - - В C++ API добавлены методы фиксации транзакции с получением информации - о задержках. - - Отключение учета «грязных» страниц в не требующих этого режимах (`MDBX_WRITEMAP` при `MDBX_AVOID_MSYNC=0`). Доработка позволяет снизить накладные расходы и была запланирована давно, но откладывалась так как @@ -81,25 +77,53 @@ ChangeLog данных. Тем не менее, по имеющейся информации, на Windows такой путь записи данных в целом обеспечивает более высокую производительность. + - Улучшение эвристики включения авто-слияния записей GC. + + - Изменение формата LCK и семантики некоторых внутренних полей. Версии + libmdbx использующие разный формат не смогут работать с одной БД + одновременно, а только поочередно (LCK-файл переписывается при открытии + первым открывающим БД процессом). + + - В C++ API добавлены методы фиксации транзакции с получением информации + о задержках. + - Added `MDBX_HAVE_BUILT IN_CPU_SUPPORTS` build option to control use GCC's `__builtin_cpu_supports()` function, which could be unavailable on a fake OSes (macos, ios, android, etc). 
Исправления: + - Больше контроля и осторожности (паранойи) для страховки от дефектов `mremap()`. + - Костыль для починки сборки со старыми версиями `stdatomic.h` из GNU Lib C, + где макросы `ATOMIC_*_LOCK_FREE` ошибочно переопределяются через функции. + - Использование `fcntl64(F_GETLK64/F_SETLK64/F_SETLKW64)` при наличии. + Это решает проблему срабатывания проверочного утверждения при сборке для + платформ где тип `off_t` шире соответствующих полей `структуры flock`, + используемой для блокировки файлов. - Доработан сбор информации о задержках при фиксации транзакций: * Устранено искажение замеров длительности обновления GC при включении отладочного внутреннего аудита; * Защита от undeflow-нуля только общей задержки в метриках, чтобы исключить ситуации, когда сумма отдельных стадий больше общей длительности. - - Ряд исправлений для устранения срабатываний проверочных утверждения в отладочных сборках. - - Исправление лишнего сброса данных на диск в режиме `MDBX_SAFE_NOSYNC` при обновлении GC. - - Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` which could result in returning `MDBX_EKEYMISMATCH` for valid cases. + - Ряд исправлений для устранения срабатываний проверочных утверждения в + отладочных сборках. + - Более осторожное преобразование к типу `mdbx_tid_t` для устранения + предупреждений. + - Исправление лишнего сброса данных на диск в режиме `MDBX_SAFE_NOSYNC` + при обновлении GC. + - Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` + which could result in returning `MDBX_EKEYMISMATCH` for valid cases. - Fixed nasty `clz()` bug (by using `_BitScanReverse()`, only MSVC builds affected). Мелочи: + - Проверка атомарности C11-операций c 32/64-битными данными. + - Уменьшение в 42 раза значения по-умолчанию для `me_options.dp_limit` + в отладочных сборках. + - Добавление платформы `gcc-riscv64-linux-gnu` в список для цели `cross-gcc`. + - Небольшие правки скрипта `long_stochastic.sh` для работы в Windows. 
+ - Удаление ненужного вызова `LockFileEx()` внутри `mdbx_env_copy()`. - Добавлено описание использования файловых дескрипторов в различных режимах. - Добавлено использование _CrtDbgReport() в отладочных сборках. - Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`. From 206dbecccfb745db77e6598c278841661748d175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 14 Oct 2022 16:20:57 +0300 Subject: [PATCH 166/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=20ChangeLog=20=D1=81?= =?UTF-8?q?=D1=81=D1=8B=D0=BB=D0=BE=D0=BA=20=D1=81=20=D0=BC=D0=B0=D1=88?= =?UTF-8?q?=D0=B8=D0=BD=D0=BD=D1=8B=D0=BC=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B2?= =?UTF-8?q?=D0=BE=D0=B4=D0=BE=D0=BC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 8842b125..1777bfe5 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,10 @@ ChangeLog --------- +English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) +and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). + + ## В разработке v0.12.2 Новое: @@ -40,13 +44,13 @@ ChangeLog суммарный размер пары ключ-значение. - Реализовано использование асинхронной (overlapped) записи в Windows, - включая использования небуфферизированного ввода-вывода и WriteGather(). + включая использования небуфферизированного ввода-вывода и `WriteGather()`. Это позволяет сократить накладные расходы и частично обойти проблемы Windows с низкой производительностью ввода-вывода, включая большие - задержки FlushFileBuffers(). 
Новый код также обеспечивает консолидацию + задержки `FlushFileBuffers()`. Новый код также обеспечивает консолидацию записываемых регионов на всех платформах, а на Windows использование событий (events) сведено к минимум, одновременно с автоматических - использованием WriteGather(). Поэтому ожидается существенное снижение + использованием `WriteGather()`. Поэтому ожидается существенное снижение накладных расходов взаимодействия с ОС, а в Windows это ускорение, в некоторых сценариях, может быть кратным в сравнении с LMDB. @@ -84,7 +88,7 @@ ChangeLog одновременно, а только поочередно (LCK-файл переписывается при открытии первым открывающим БД процессом). - - В C++ API добавлены методы фиксации транзакции с получением информации + - В `C++` API добавлены методы фиксации транзакции с получением информации о задержках. - Added `MDBX_HAVE_BUILT IN_CPU_SUPPORTS` build option to control use GCC's @@ -125,7 +129,7 @@ ChangeLog - Небольшие правки скрипта `long_stochastic.sh` для работы в Windows. - Удаление ненужного вызова `LockFileEx()` внутри `mdbx_env_copy()`. - Добавлено описание использования файловых дескрипторов в различных режимах. - - Добавлено использование _CrtDbgReport() в отладочных сборках. + - Добавлено использование `_CrtDbgReport()` в отладочных сборках. - Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`. - Removed description of deprecated usage of `MDBX_NODUPDATA`. - Fixed regression ASAN/Valgring-enabled builds. 
From 8833dc6871abd15e999c34b96c9e007cc72f4acd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 22 Oct 2022 01:38:33 +0300 Subject: [PATCH 167/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=81=D1=82=D1=8B?= =?UTF-8?q?=D0=BB=D1=8C=20=D0=B4=D0=BB=D1=8F=20=D0=BE=D0=B1=D1=85=D0=BE?= =?UTF-8?q?=D0=B4=D0=B0=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BE=D0=BA=20ecryptfs?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Выяснилось, что утилита `mdbx_copy` и функции `mdbx_env_copy()` могут создавать ПРОБЛЕМЫ, если целевой файл расположен в ecryptfs (такая файловая система в Linux). При этом может быть четыре исхода в зависимости от версии ядра и положения звезд на небе: - всё хорошо; - плохие данные в копии без возврата ошибок; - ошибка EINVAL(22) при копировании; - oops или зависание ядра, отвал смонтированной ecryptfs и т.п. В текущем понимании, причина обусловлена ошибкой в коде fs, которая проявляется при использовании системного вызова `copy_file_range`. 
--- src/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 3e7ebd23..1723a9ad 100644 --- a/src/core.c +++ b/src/core.c @@ -20339,6 +20339,11 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, #if MDBX_USE_COPYFILERANGE static bool copyfilerange_unavailable; bool not_the_same_filesystem = false; + struct statfs statfs_info; + if (fstatfs(fd, &statfs_info) || + statfs_info.f_type == /* ECRYPTFS_SUPER_MAGIC */ 0xf15f) + /* avoid use copyfilerange_unavailable() to ecryptfs due bugs */ + not_the_same_filesystem = true; #endif /* MDBX_USE_COPYFILERANGE */ for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { #if MDBX_USE_SENDFILE @@ -20372,7 +20377,9 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, if (bytes_copied == 0) break; rc = errno; - if (rc == EXDEV) + if (rc == EXDEV || rc == /* workaround for ecryptfs bug(s), + maybe usefull for others fs */ + EINVAL) not_the_same_filesystem = true; else if (ignore_enosys(rc) == MDBX_RESULT_TRUE) copyfilerange_unavailable = true; From cd616447dabd27db409f9899f696cbbb08d44ba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 15 Oct 2022 20:49:39 +0300 Subject: [PATCH 168/364] mdbx-cmake: set X86_32/X86_64/ARM/MIPS for Windows and Android. 
--- cmake/compiler.cmake | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/cmake/compiler.cmake b/cmake/compiler.cmake index 842c8016..ef308fc2 100644 --- a/cmake/compiler.cmake +++ b/cmake/compiler.cmake @@ -203,9 +203,38 @@ endif() if(NOT CMAKE_SYSTEM_ARCH) if(CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ARCHITECTURE_ID) - set(CMAKE_SYSTEM_ARCH "${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ARCHITECTURE_ID}") + string(TOLOWER "${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ARCHITECTURE_ID}" CMAKE_SYSTEM_ARCH) + if(CMAKE_SYSTEM_ARCH STREQUAL "x86") + set(X86_32 TRUE) + elseif(CMAKE_SYSTEM_ARCH STREQUAL "x86_64" OR CMAKE_SYSTEM_ARCH STREQUAL "x64") + set(X86_64 TRUE) + set(CMAKE_SYSTEM_ARCH "x86_64") + elseif(CMAKE_SYSTEM_ARCH MATCHES "^(aarch.*|arm.*)") + if(CMAKE_TARGET_BITNESS EQUAL 64) + set(AARCH64 TRUE) + else() + set(ARM32 TRUE) + endif() + endif() elseif(CMAKE_ANDROID_ARCH_ABI) set(CMAKE_SYSTEM_ARCH "${CMAKE_ANDROID_ARCH_ABI}") + if(CMAKE_SYSTEM_ARCH STREQUAL "x86") + set(X86_32 TRUE) + elseif(CMAKE_SYSTEM_ARCH STREQUAL "x86_64") + set(X86_64 TRUE) + elseif(CMAKE_SYSTEM_ARCH MATCHES "^(aarch.*|AARCH.*|arm.*|ARM.*)") + if(CMAKE_TARGET_BITNESS EQUAL 64) + set(AARCH64 TRUE) + else() + set(ARM32 TRUE) + endif() + elseif(CMAKE_SYSTEM_ARCH MATCHES "^(mips|MIPS).*") + if(CMAKE_TARGET_BITNESS EQUAL 64) + set(MIPS64 TRUE) + else() + set(MIPS32 TRUE) + endif() + endif() elseif(CMAKE_COMPILER_IS_ELBRUSC OR CMAKE_COMPILER_IS_ELBRUSCXX OR CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_ID STREQUAL "LCC" OR CMAKE_SYSTEM_PROCESSOR MATCHES "e2k.*|E2K.*|elbrus.*|ELBRUS.*") From 64d0e639c2eabb3bb11c216c194683835aaa1c68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 16 Oct 2022 18:10:44 +0300 Subject: [PATCH 169/364] =?UTF-8?q?mdbx-cmake:=20=D1=81=D0=B8=D0=BD=D1=85?= =?UTF-8?q?=D1=80=D0=BE=D0=BD=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20LTO-?= 
=?UTF-8?q?=D1=80=D0=B5=D1=86=D0=B5=D0=BF=D1=82=D0=BE=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a631fa78..ad18b945 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -241,17 +241,16 @@ else() option(BUILD_FOR_NATIVE_CPU "Generate code for the compiling machine CPU" OFF) endif() - if(CMAKE_CONFIGURATION_TYPES OR NOT CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") + if((CMAKE_CONFIGURATION_TYPES OR NOT CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") + AND (NOT CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) + AND (NOT CMAKE_COMPILER_IS_CLANG OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.0)) set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT ON) else() set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT OFF) endif() if(CMAKE_INTERPROCEDURAL_OPTIMIZATION_AVAILABLE - OR GCC_LTO_AVAILABLE OR MSVC_LTO_AVAILABLE OR - (CLANG_LTO_AVAILABLE AND - ((DEFINED MDBX_ENABLE_TESTS AND NOT MDBX_ENABLE_TESTS) - OR NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0))) + OR GCC_LTO_AVAILABLE OR MSVC_LTO_AVAILABLE OR CLANG_LTO_AVAILABLE) option(INTERPROCEDURAL_OPTIMIZATION "Enable interprocedural/LTO optimization" ${INTERPROCEDURAL_OPTIMIZATION_DEFAULT}) else() set(INTERPROCEDURAL_OPTIMIZATION OFF) @@ -261,14 +260,25 @@ else() if(GCC_LTO_AVAILABLE) set(LTO_ENABLED TRUE) set(CMAKE_AR ${CMAKE_GCC_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) + set(CMAKE_C_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) + set(CMAKE_CXX_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) set(CMAKE_NM ${CMAKE_GCC_NM} CACHE PATH "Path to nm program with LTO-plugin" FORCE) set(CMAKE_RANLIB ${CMAKE_GCC_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE) + set(CMAKE_C_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib 
program with LTO-plugin" FORCE) + set(CMAKE_CXX_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE) message(STATUS "MDBX indulge Link-Time Optimization by GCC") elseif(CLANG_LTO_AVAILABLE) set(LTO_ENABLED TRUE) + if(CMAKE_CLANG_LD) + set(CMAKE_LINKER ${CMAKE_CLANG_LD} CACHE PATH "Path to lld or ld program with LTO-plugin" FORCE) + endif() set(CMAKE_AR ${CMAKE_CLANG_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) + set(CMAKE_C_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) + set(CMAKE_CXX_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) set(CMAKE_NM ${CMAKE_CLANG_NM} CACHE PATH "Path to nm program with LTO-plugin" FORCE) set(CMAKE_RANLIB ${CMAKE_CLANG_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE) + set(CMAKE_C_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE) + set(CMAKE_CXX_COMPILER_RANLIB ${CMAKE_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE) message(STATUS "MDBX indulge Link-Time Optimization by CLANG") elseif(MSVC_LTO_AVAILABLE) set(LTO_ENABLED TRUE) From bbd139b2aea23250f51f5ce506ba1f19e3115ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 16 Oct 2022 19:05:11 +0300 Subject: [PATCH 170/364] =?UTF-8?q?mdbx-cmake:=20=D1=81=D0=BE=D0=B7=D0=B4?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D0=B5=20VERSION.txt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad18b945..6252d52a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -970,6 +970,7 @@ if (NOT SUBPROJECT) set(CPACK_PACKAGE_VERSION_COMMIT ${MDBX_VERSION_REVISION}) set(PACKAGE_VERSION 
"${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${CPACK_PACKAGE_VERSION_COMMIT}") message(STATUS "libmdbx package version is ${PACKAGE_VERSION}") + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/VERSION.txt" "${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE}.${MDBX_VERSION_REVISION}") endif() cmake_policy(POP) From 753fa13048624d0e85d167e841294d79a5ca629d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 17 Oct 2022 11:10:22 +0300 Subject: [PATCH 171/364] =?UTF-8?q?mdbx:=20=D1=83=D0=B4=D0=B0=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20=D0=BB=D0=B8=D1=88=D0=BD=D0=B8=D1=85=20?= =?UTF-8?q?=D0=BA=D0=BE=D0=BC=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D1=80=D0=B8?= =?UTF-8?q?=D0=B5=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/core.c b/src/core.c index 1723a9ad..e23bf107 100644 --- a/src/core.c +++ b/src/core.c @@ -20280,7 +20280,6 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) ? (size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; - /* copy to avoid EFAULT in case swapped-out */ int rc = osal_write(fd, data_buffer, chunk); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20410,7 +20409,6 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) ? 
(size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; - /* copy to avoid EFAULT in case swapped-out */ rc = osal_write(fd, data_buffer, chunk); offset += chunk; } From 4e95a079eeecbc648835b2e3dfaf646804e75053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 20 Oct 2022 19:00:29 +0300 Subject: [PATCH 172/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B8?= =?UTF-8?q?=D0=BC=D0=B5=D0=BD=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20`MDBX?= =?UTF-8?q?=5FCOMMIT=5FPAGES`=20=D0=B2=20`MDBX=5FAUXILARY=5FIOV=5FMAX`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 8 ++++---- src/internals.h | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/core.c b/src/core.c index e23bf107..f8a2b827 100644 --- a/src/core.c +++ b/src/core.c @@ -4047,15 +4047,15 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, if ((txn->mt_flags & MDBX_WRITEMAP) == 0) osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); } else { - struct iovec iov[MDBX_COMMIT_PAGES]; + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; iov[0].iov_len = env->me_psize; iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; size_t iov_off = pgno2bytes(env, pgno), n = 1; while (--npages) { iov[n] = iov[0]; - if (++n == MDBX_COMMIT_PAGES) { - osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off); - iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); + if (++n == MDBX_AUXILARY_IOV_MAX) { + osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, iov_off); + iov_off += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); n = 0; } } diff --git a/src/internals.h b/src/internals.h index ab748f3a..721a6131 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1579,12 +1579,12 @@ typedef struct MDBX_node { #error "Oops, some flags overlapped or wrong" #endif -/* max number of pages to commit in one writev() call */ 
-#define MDBX_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ -#undef MDBX_COMMIT_PAGES -#define MDBX_COMMIT_PAGES IOV_MAX -#endif +/* Max length of iov-vector passed to writev() call, used for auxilary writes */ +#define MDBX_AUXILARY_IOV_MAX 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX +#undef MDBX_AUXILARY_IOV_MAX +#define MDBX_AUXILARY_IOV_MAX IOV_MAX +#endif /* MDBX_AUXILARY_IOV_MAX */ /* * / From b04f7814efb45280336ced9342a44bfee312220a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 21 Oct 2022 23:09:56 +0300 Subject: [PATCH 173/364] =?UTF-8?q?mdbx-cmake:=20=D0=B8=D1=81=D0=BF=D1=80?= =?UTF-8?q?=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B4=D0=BB=D1=8F?= =?UTF-8?q?=20=D1=81=D0=BE=D0=B2=D0=BC=D0=B5=D1=81=D1=82=D0=B8=D0=BC=D0=BE?= =?UTF-8?q?=D1=81=D1=82=D0=B8=20=D1=81=20CMake=203.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6252d52a..dfbaa3de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -762,7 +762,6 @@ if(MDBX_BUILD_SHARED_LIBRARY) if(CMAKE_VERSION VERSION_LESS 3.12) install(TARGETS mdbx EXPORT libmdbx LIBRARY DESTINATION ${MDBX_DLL_INSTALL_DESTINATION} COMPONENT runtime - OBJECTS DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel) From d661d4bac765e116c1b10c5fc0ed199a9e779de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 24 Oct 2022 01:02:38 +0300 Subject: [PATCH 174/364] 
=?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`mdbx=5Fenv=5Fwarmup()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 88 +++++++++++ src/base.h | 2 + src/core.c | 380 ++++++++++++++++++++++++++++++++++++++++++++++-- src/internals.h | 8 +- 4 files changed, 465 insertions(+), 13 deletions(-) diff --git a/mdbx.h b/mdbx.h index 17d1a17b..72b377f9 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2808,6 +2808,94 @@ LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) { return mdbx_env_close_ex(env, false); } +/** \brief Warming up options + * \ingroup c_settings + * \anchor warmup_flags + * \see mdbx_env_warmup() */ +enum MDBX_warmup_flags_t { + /** By default \ref mdbx_env_warmup() just ask OS kernel to asynchronously + * prefetch database pages. */ + MDBX_warmup_default = 0, + + /** Peeking all pages of allocated portion of the database + * to force ones to be loaded into memory. However, the pages are just peeks + * sequentially, so unused pages that are in GC will be loaded in the same + * way as those that contain payload. */ + MDBX_warmup_force = 1, + + /** Using system calls to peeks pages instead of directly accessing ones, + * which at the cost of additional overhead avoids killing the current + * process by OOM-killer in a lack of memory condition. + * \note Has effect only on POSIX (non-Windows) systems with conjunction + * to \ref MDBX_warmup_force option. */ + MDBX_warmup_oomsafe = 2, + + /** Try to lock database pages in memory by `mlock()` on POSIX-systems + * or `VirtualLock()` on Windows. Please refer to description of these + * functions for reasonability of such locking and the information of + * effects, including the system as a whole. + * + * Such locking in memory requires that the corresponding resource limits + * (e.g. `RLIMIT_RSS`, `RLIMIT_MEMLOCK` or process working set size) + * and the availability of system RAM are sufficiently high. 
+ * + * On success, all currently allocated pages, both unused in GC and + * containing payload, will be locked in memory until the environment closes, + * or explicitly unlocked by using \ref MDBX_warmup_release, or the + * database geometry is changed, including its auto-shrinking. */ + MDBX_warmup_lock = 4, + + /** Alters corresponding current resource limits to be enough to lock pages + * by \ref MDBX_warmup_lock. However, this option should be used in the simplest + * applications since it takes into account only current size of this environment + * disregarding all other factors. For real-world database application you + * will need full-fledged management of resources and their limits with + * respective engineering. */ + MDBX_warmup_touchlimit = 8, + + /** Release the lock that was performed before by \ref MDBX_warmup_lock. */ + MDBX_warmup_release = 16, +}; +#ifndef __cplusplus +typedef enum MDBX_warmup_flags_t MDBX_warmup_flags_t; +#else +DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t) +#endif + +/** \brief Warms up the database by loading pages into memory, optionally locking + * them. \ingroup c_settings + * + * Depending on the specified flags, notifies OS kernel about following access, + * force loads the database pages, including locking them in memory or releases + * such a lock. However, the function does not analyze the b-tree nor the GC. + * Therefore unused pages that are in GC are handled (i.e. will be loaded) in + * the same way as those that contain payload. + * + * At least one of env or txn argument must be non-null. + * + * \param [in] env An environment handle returned + * by \ref mdbx_env_create(). + * \param [in] txn A transaction handle returned + * by \ref mdbx_txn_begin(). + * \param [in] flags The \ref warmup_flags, bitwise OR'ed together.
+ * + * \param [in] timeout_seconds_16dot16 Optional timeout which is checked only + * while explicitly peeking database pages + * for loading them if the \ref MDBX_warmup_force + * option was specified. + * + * \returns A non-zero error value on failure and 0 on success. + * Some possible errors are: + * + * \retval MDBX_ENOSYS The system does not support requested + * operation(s). + * + * \retval MDBX_RESULT_TRUE The specified timeout was reached while loading + * data into memory. */ +LIBMDBX_API int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, + MDBX_warmup_flags_t flags, + unsigned timeout_seconds_16dot16); + /** \brief Set environment flags. * \ingroup c_settings * diff --git a/src/base.h b/src/base.h index 1596d26a..187b5270 100644 --- a/src/base.h +++ b/src/base.h @@ -263,8 +263,10 @@ __extern_C key_t ftok(const char *, int); #include #include #include +#include #include #include +#include #include #endif /*---------------------------------------------------------------------*/ diff --git a/src/core.c b/src/core.c index f8a2b827..5c0d8815 100644 --- a/src/core.c +++ b/src/core.c @@ -5561,7 +5561,7 @@ MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #if MDBX_ENABLE_MADVISE /* Turn on/off readahead. It's harmful when the DB is larger than RAM.
*/ -__cold static int set_readahead(MDBX_env *env, const pgno_t edge, +__cold static int set_readahead(const MDBX_env *env, const pgno_t edge, const bool enable, const bool force_whole) { eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); eASSERT(env, (enable & 1) == (enable != 0)); @@ -5687,6 +5687,82 @@ __cold static int set_readahead(MDBX_env *env, const pgno_t edge, } #endif /* MDBX_ENABLE_MADVISE */ +__cold static void update_mlocked(const MDBX_env *env, + const pgno_t new_aligned_mlocked_pgno, + const bool lock_not_release) { + for (;;) { + const pgno_t mlock_pgno_snap = + atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease); + eASSERT(env, pgno_align2os_pgno(env, mlock_pgno_snap) == mlock_pgno_snap); + eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) == + new_aligned_mlocked_pgno); + if (lock_not_release ? (mlock_pgno_snap >= new_aligned_mlocked_pgno) + : (mlock_pgno_snap <= new_aligned_mlocked_pgno)) + break; + if (likely(atomic_cas32(&((MDBX_env *)env)->me_mlocked_pgno, + mlock_pgno_snap, new_aligned_mlocked_pgno))) + for (;;) { + MDBX_atomic_uint32_t *const mlock_counter = + &env->me_lck->mti_mlock_counter; + const uint32_t snap_counter = atomic_load32(mlock_counter, mo_Relaxed); + if (mlock_pgno_snap == 0 && snap_counter < INT_MAX) { + eASSERT(env, lock_not_release); + if (unlikely( + !atomic_cas32(mlock_counter, snap_counter, snap_counter + 1))) + continue; + } + if (new_aligned_mlocked_pgno == 0 && snap_counter > 0) { + eASSERT(env, !lock_not_release); + if (unlikely( + !atomic_cas32(mlock_counter, snap_counter, snap_counter - 1))) + continue; + } + NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u", + lock_not_release ? "lock" : "unlock", + lock_not_release ? mlock_pgno_snap : new_aligned_mlocked_pgno, + lock_not_release ? 
new_aligned_mlocked_pgno : mlock_pgno_snap, + snap_counter, atomic_load32(mlock_counter, mo_Relaxed)); + return; + } + } +} + +__cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, + const size_t end_bytes) { + if (atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) > aligned_pgno) { + int err = MDBX_ENOSYS; + const size_t munlock_begin = pgno2bytes(env, aligned_pgno); + const size_t munlock_size = end_bytes - munlock_begin; + eASSERT(env, end_bytes % env->me_os_psize == 0 && + munlock_begin % env->me_os_psize == 0 && + munlock_size % env->me_os_psize == 0); +#if defined(_WIN32) || defined(_WIN64) + err = VirtualUnlock(env->me_map + munlock_begin, munlock_size) + ? MDBX_SUCCESS + : (int)GetLastError(); + if (err == ERROR_NOT_LOCKED) + err = MDBX_SUCCESS; +#elif defined(_POSIX_MEMLOCK_RANGE) + err = munlock(env->me_map + munlock_begin, munlock_size) ? errno + : MDBX_SUCCESS; +#endif + if (likely(err == MDBX_SUCCESS)) + update_mlocked(env, aligned_pgno, false); + else { +#if defined(_WIN32) || defined(_WIN64) + WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size, + err); +#else + WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err); +#endif + } + } +} + +__cold static void munlock_all(const MDBX_env *env) { + munlock_after(env, 0, bytes_align2os_bytes(env, env->me_dxb_mmap.current)); +} + __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, const pgno_t size_pgno, const pgno_t limit_pgno, const bool implicit) { @@ -5790,6 +5866,12 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, goto bailout; } + const pgno_t aligned_munlock_pgno = + (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) + ? 
0 + : bytes2pgno(env, size_bytes); + munlock_after(env, aligned_munlock_pgno, size_bytes); + #if MDBX_ENABLE_MADVISE if (size_bytes < prev_size) { NOTICE("resize-MADV_%s %u..%u", @@ -5820,10 +5902,23 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, prev_size - size_bytes, POSIX_FADV_DONTNEED)); #endif /* MADV_DONTNEED */ - if (unlikely(MDBX_IS_ERROR(rc))) - goto bailout; - if (env->me_lck->mti_discarded_tail.weak > size_pgno) - env->me_lck->mti_discarded_tail.weak = size_pgno; + uint32_t snap_mlock_counter; + if (unlikely(rc == MDBX_EINVAL) && + (snap_mlock_counter = + atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed)) > 0) { + NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have %u " + "mlocked-process(es))", + "resize", rc, snap_mlock_counter); + } else { + if (unlikely(MDBX_IS_ERROR(rc))) { + ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d", + "mresize", "DONTNEED", size_bytes, prev_size - size_bytes, + atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), rc); + goto bailout; + } + if (env->me_lck->mti_discarded_tail.weak > size_pgno) + env->me_lck->mti_discarded_tail.weak = size_pgno; + } } #endif /* MDBX_ENABLE_MADVISE */ @@ -11368,13 +11463,15 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, const pgno_t prev_discarded_pgno = atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) { - NOTICE("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, + NOTICE("shrink-MADV_%s %u..%u", "DONTNEED", largest_pgno, prev_discarded_pgno); atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, mo_Relaxed); const size_t prev_discarded_bytes = ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); ENSURE(env, prev_discarded_bytes > discard_edge_bytes); + munlock_after(env, discard_edge_pgno, + bytes_align2os_bytes(env, env->me_dxb_mmap.current)); #if 
defined(MADV_DONTNEED) int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ @@ -11391,8 +11488,23 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, env->me_map + discard_edge_bytes, prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); #endif - if (unlikely(MDBX_IS_ERROR(err))) + uint32_t snap_mlock_counter; + if (unlikely(err == MDBX_EINVAL) && + (snap_mlock_counter = atomic_load32(&env->me_lck->mti_mlock_counter, + mo_Relaxed)) > 0) { + NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have " + "%u mlocked-process(es))", + "shrink", err, snap_mlock_counter); + } else if (unlikely(MDBX_IS_ERROR(err))) { + ERROR("%s-madvise(%s, %zu..%zu), err %d", "shrink", "DONTNEED", + discard_edge_bytes, prev_discarded_bytes - discard_edge_bytes, + err); + ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d", + "shrink", "DONTNEED", discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, + atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), err); return err; + } } #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ @@ -13896,17 +14008,21 @@ __cold static int env_close(MDBX_env *env) { } env->me_flags &= ~ENV_INTERNAL_FLAGS; - env->me_lck = nullptr; if (flags & MDBX_ENV_TXKEY) { rthc_remove(env->me_txkey); env->me_txkey = (osal_thread_key_t)0; } + munlock_all(env); + osal_ioring_destroy(&env->me_ioring); + lcklist_lock(); const int rc = lcklist_detach_locked(env); lcklist_unlock(); - osal_ioring_destroy(&env->me_ioring); + env->me_lck = nullptr; + if (env->me_lck_mmap.lck) + osal_munmap(&env->me_lck_mmap); if (env->me_map) { osal_munmap(&env->me_dxb_mmap); @@ -13934,9 +14050,6 @@ __cold static int env_close(MDBX_env *env) { env->me_lazy_fd = INVALID_HANDLE_VALUE; } - if (env->me_lck_mmap.lck) - osal_munmap(&env->me_lck_mmap); - if (env->me_lfd != INVALID_HANDLE_VALUE) { (void)osal_closefile(env->me_lfd); env->me_lfd = INVALID_HANDLE_VALUE; @@ -23589,6 +23702,249 @@ 
__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, return MDBX_SUCCESS; } +static size_t estimate_rss(size_t database_bytes) { + return database_bytes + database_bytes / 64 + + (512 + MDBX_WORDBITS * 16) * MEGABYTE; +} + +__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, + MDBX_warmup_flags_t flags, + unsigned timeout_seconds_16dot16) { + if (unlikely(env == NULL && txn == NULL)) + return MDBX_EINVAL; + if (unlikely(flags > + (MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock | + MDBX_warmup_touchlimit | MDBX_warmup_release))) + return MDBX_EINVAL; + + if (txn) { + int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + if (env) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (txn && unlikely(txn->mt_env != env)) + return MDBX_EINVAL; + } else { + env = txn->mt_env; + } + + const uint64_t timeout_monotime = + (timeout_seconds_16dot16 && (flags & MDBX_warmup_force)) + ? osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16) + : 0; + + if (flags & MDBX_warmup_release) + munlock_all(env); + + pgno_t used_pgno; + if (txn) { + used_pgno = txn->mt_geo.next; + } else { + const meta_troika_t troika = meta_tap(env); + used_pgno = meta_recent(env, &troika).ptr_v->mm_geo.next; + } + const size_t used_range = pgno_align2os_bytes(env, used_pgno); + const pgno_t mlock_pgno = bytes2pgno(env, used_range); + + int rc = MDBX_SUCCESS; + if (flags & MDBX_warmup_touchlimit) { + const size_t estimated_rss = estimate_rss(used_range); +#if defined(_WIN32) || defined(_WIN64) + SIZE_T current_ws_lower, current_ws_upper; + if (GetProcessWorkingSetSize(GetCurrentProcess(), ¤t_ws_lower, + ¤t_ws_upper) && + current_ws_lower < estimated_rss) { + const SIZE_T ws_lower = estimated_rss; + const SIZE_T ws_upper = + (MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048) + ? 
ws_lower + : ws_lower + MDBX_WORDBITS * MEGABYTE * 32; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) { + rc = (int)GetLastError(); + WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower, + ws_upper, rc); + } + } +#endif /* Windows */ +#ifdef RLIMIT_RSS + struct rlimit rss; + if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) { + rss.rlim_cur = estimated_rss; + if (rss.rlim_max < estimated_rss) + rss.rlim_max = used_range; + if (setrlimit(RLIMIT_RSS, &rss)) { + rc = errno; + WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS", + (size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc); + } + } +#endif /* RLIMIT_RSS */ +#ifdef RLIMIT_MEMLOCK + if (flags & MDBX_warmup_lock) { + struct rlimit memlock; + if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 && + memlock.rlim_cur < estimated_rss) { + memlock.rlim_cur = estimated_rss; + if (memlock.rlim_max < estimated_rss) + memlock.rlim_max = estimated_rss; + if (setrlimit(RLIMIT_MEMLOCK, &memlock)) { + rc = errno; + WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK", + (size_t)memlock.rlim_cur, (size_t)memlock.rlim_max, rc); + } + } + } +#endif /* RLIMIT_MEMLOCK */ + (void)estimated_rss; + } + +#if defined(MLOCK_ONFAULT) && \ + ((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) || \ + (defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) && \ + (defined(__linux__) || defined(__gnu_linux__)) + if ((flags & MDBX_warmup_lock) != 0 && linux_kernel_version >= 0x04040000 && + atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { + if (mlock2(env->me_map, used_range, MLOCK_ONFAULT)) { + rc = errno; + WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc); + } else { + update_mlocked(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } + if (rc != EINVAL) + flags -= MDBX_warmup_lock; + } +#endif /* MLOCK_ONFAULT */ + + int err = MDBX_ENOSYS; +#if MDBX_ENABLE_MADVISE + err = set_readahead(env, used_pgno, true, true); +#else +#if defined(_WIN32) 
|| defined(_WIN64) + if (mdbx_PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = env->me_map; + hint.NumberOfBytes = used_range; + if (mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0)) + err = MDBX_SUCCESS; + else { + err = (int)GetLastError(); + ERROR("%s(%zu) error %d", "PrefetchVirtualMemory", used_range, err); + } + } +#endif /* Windows */ + +#if defined(POSIX_MADV_WILLNEED) + err = posix_madvise(env->me_map, used_range, POSIX_MADV_WILLNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; +#elif defined(MADV_WILLNEED) + err = madvise(env->me_map, used_range, MADV_WILLNEED) ? ignore_enosys(errno) + : MDBX_SUCCESS; +#endif + +#if defined(F_RDADVISE) + if (err) { + fcntl(env->me_lazy_fd, F_RDAHEAD, true); + struct radvisory hint; + hint.ra_offset = 0; + hint.ra_count = unlikely(used_range > INT_MAX && + sizeof(used_range) > sizeof(hint.ra_count)) + ? INT_MAX + : (int)used_range; + err = fcntl(env->me_lazy_fd, F_RDADVISE, &hint) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (err == ENOTTY) + err = MDBX_SUCCESS /* Ignore ENOTTY for DB on the ram-disk */; + } +#endif /* F_RDADVISE */ +#endif /* MDBX_ENABLE_MADVISE */ + if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS) + rc = err; + + if ((flags & MDBX_warmup_force) != 0 && + (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) { + const volatile uint8_t *ptr = env->me_map; + size_t offset = 0, unused = 42; +#if !(defined(_WIN32) || defined(_WIN64)) + if (flags & MDBX_warmup_oomsafe) { + const int null_fd = open("/dev/null", O_WRONLY); + if (unlikely(null_fd < 0)) + rc = errno; + else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + for (;;) { + unsigned i; + for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) { + iov[i].iov_base = (void *)(ptr + offset); + iov[i].iov_len = 1; + offset += env->me_os_psize; + } + if (unlikely(writev(null_fd, iov, i) < 0)) { + rc = errno; + if (rc == EFAULT) + rc = ENOMEM; + break; + } + if (offset >= used_range) { + rc = MDBX_SUCCESS; + break; + } 
+ if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) { + rc = MDBX_RESULT_TRUE; + break; + } + } + close(null_fd); + } + } else +#endif /* Windows */ + for (;;) { + unused += ptr[offset]; + offset += env->me_os_psize; + if (offset >= used_range) { + rc = MDBX_SUCCESS; + break; + } + if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) { + rc = MDBX_RESULT_TRUE; + break; + } + } + (void)unused; + } + + if ((flags & MDBX_warmup_lock) != 0 && + (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) && + atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { +#if defined(_WIN32) || defined(_WIN64) + if (VirtualLock(env->me_map, used_range)) { + update_mlocked(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } else { + rc = (int)GetLastError(); + WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc); + } +#elif defined(_POSIX_MEMLOCK_RANGE) + if (mlock(env->me_map, used_range) == 0) { + update_mlocked(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } else { + rc = errno; + WARNING("%s(%zu) error %d", "mlock", used_range, rc); + } +#else + rc = MDBX_ENOSYS; +#endif + } + + return rc; +} + __cold void global_ctor(void) { rthc_limit = RTHC_INITIAL_LIMIT; rthc_table = rthc_table_static; diff --git a/src/internals.h b/src/internals.h index 721a6131..f35d9aad 100644 --- a/src/internals.h +++ b/src/internals.h @@ -731,6 +731,11 @@ typedef struct MDBX_lockinfo { /* Marker to distinguish uniqueness of DB/CLK. */ MDBX_atomic_uint64_t mti_bait_uniqueness; + /* Counter of processes which had mlock()'ed some of mmapped DB pages. + * Non-zero means at least one process lock at leat one page, + * and therefore madvise() could return EINVAL. 
*/ + MDBX_atomic_uint32_t mti_mlock_counter; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ #if MDBX_ENABLE_PGOP_STAT @@ -1169,7 +1174,8 @@ struct MDBX_env { unsigned me_psize; /* DB page size, initialized from me_os_psize */ unsigned me_leaf_nodemax; /* max size of a leaf-node */ unsigned me_branch_nodemax; /* max size of a branch-node */ - uint8_t me_psize2log; /* log2 of DB page size */ + atomic_pgno_t me_mlocked_pgno; + uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for From 7902b97a3d5393ac1a5b8aff8402a56bd684b284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 21 Oct 2022 23:09:44 +0300 Subject: [PATCH 175/364] =?UTF-8?q?mdbx-test:=20=D0=BF=D1=80=D0=BE=D1=81?= =?UTF-8?q?=D1=82=D0=B0=D1=8F=20=D0=BF=D1=80=D0=BE=D0=B2=D0=B5=D1=80=D0=BA?= =?UTF-8?q?=D0=B0=20warmup.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test.cc b/test/test.cc index 6f620223..60fd6914 100644 --- a/test/test.cc +++ b/test/test.cc @@ -199,6 +199,19 @@ void testcase::txn_begin(bool readonly, MDBX_txn_flags_t flags) { log_trace("<< txn_begin(%s, 0x%04X)", readonly ? "read-only" : "read-write", flags); + + if (flipcoin_n(5)) { + const unsigned mask = + unsigned(MDBX_warmup_default | MDBX_warmup_force | MDBX_warmup_oomsafe | + MDBX_warmup_lock | MDBX_warmup_touchlimit); + static unsigned counter; + MDBX_warmup_flags_t warmup_flags = MDBX_warmup_flags_t( + (counter > MDBX_warmup_release) ? 
prng64() & mask : counter); + counter += 1; + int err = mdbx_env_warmup(db_guard.get(), txn, warmup_flags, 0); + log_trace("== counter %u, env_warmup(flags %u), rc %d", counter, + warmup_flags, err); + } } int testcase::breakable_commit() { From 9eaf86bde117d9fc341dc6db0b87213fa971c5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 24 Oct 2022 12:49:37 +0300 Subject: [PATCH 176/364] =?UTF-8?q?mdbx-tools:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BE=D0=BF=D1=86=D0=B8?= =?UTF-8?q?=D0=B9=20`-u`=20=D0=B8=20`-U`=20=D0=B4=D0=BB=D1=8F=20=D0=B8?= =?UTF-8?q?=D1=81=D0=BF=D0=BE=D0=BB=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F=20`mdbx=5Fenv=5Fwarmup()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mdbx_chk.c | 65 +++++++++++++++++++++++++++++++++++++------------ src/mdbx_copy.c | 38 ++++++++++++++++++++--------- src/mdbx_dump.c | 49 +++++++++++++++++++++++++++---------- 3 files changed, 113 insertions(+), 39 deletions(-) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index d2dea1e3..311695c8 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -930,21 +930,24 @@ bailout: } static void usage(char *prog) { - fprintf(stderr, - "usage: %s [-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] " - "dbpath\n" - " -V\t\tprint version and exit\n" - " -v\t\tmore verbose, could be used multiple times\n" - " -q\t\tbe quiet\n" - " -c\t\tforce cooperative mode (don't try exclusive)\n" - " -w\t\twrite-mode checking\n" - " -d\t\tdisable page-by-page traversal of B-tree\n" - " -i\t\tignore wrong order errors (for custom comparators case)\n" - " -s subdb\tprocess a specific subdatabase only\n" - " -0|1|2\tforce using specific meta-page 0, or 2 for checking\n" - " -t\t\tturn to a specified meta-page on successful check\n" - " -T\t\tturn to a specified meta-page EVEN ON 
UNSUCCESSFUL CHECK!\n", - prog); + fprintf( + stderr, + "usage: %s " + "[-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] [-u|U] dbpath\n" + " -V\t\tprint version and exit\n" + " -v\t\tmore verbose, could be used multiple times\n" + " -q\t\tbe quiet\n" + " -c\t\tforce cooperative mode (don't try exclusive)\n" + " -w\t\twrite-mode checking\n" + " -d\t\tdisable page-by-page traversal of B-tree\n" + " -i\t\tignore wrong order errors (for custom comparators case)\n" + " -s subdb\tprocess a specific subdatabase only\n" + " -u\t\twarmup database before checking\n" + " -U\t\twarmup and try lock database pages in memory before checking\n" + " -0|1|2\tforce using specific meta-page 0, or 2 for checking\n" + " -t\t\tturn to a specified meta-page on successful check\n" + " -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n", + prog); exit(EXIT_INTERRUPTED); } @@ -1083,6 +1086,8 @@ int main(int argc, char *argv[]) { bool write_locked = false; bool turn_meta = false; bool force_turn_meta = false; + bool warmup = false; + MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default; double elapsed; #if defined(_WIN32) || defined(_WIN64) @@ -1106,6 +1111,7 @@ int main(int argc, char *argv[]) { usage(prog); for (int i; (i = getopt(argc, argv, + "uU" "0" "1" "2" @@ -1183,6 +1189,14 @@ int main(int argc, char *argv[]) { case 'i': ignore_wrong_order = true; break; + case 'u': + warmup = true; + break; + case 'U': + warmup = true; + warmup_flags = + MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock; + break; default: usage(prog); } @@ -1284,14 +1298,35 @@ int main(int argc, char *argv[]) { (envflags & MDBX_EXCLUSIVE) ? 
"monopolistic" : "cooperative"); if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) { + if (verbose) { + print(" - taking write lock..."); + fflush(nullptr); + } rc = mdbx_txn_lock(env, false); if (rc != MDBX_SUCCESS) { error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } + if (verbose) + print(" done\n"); write_locked = true; } + if (warmup) { + if (verbose) { + print(" - warming up..."); + fflush(nullptr); + } + rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); + if (MDBX_IS_ERROR(rc)) { + error("mdbx_env_warmup(flags %u) failed, error %d %s\n", warmup_flags, rc, + mdbx_strerror(rc)); + goto bailout; + } + if (verbose) + print(" %s\n", rc ? "timeout" : "done"); + } + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); if (rc) { error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc)); diff --git a/src/mdbx_copy.c b/src/mdbx_copy.c index 18eafca0..b070449b 100644 --- a/src/mdbx_copy.c +++ b/src/mdbx_copy.c @@ -44,14 +44,17 @@ static void signal_handler(int sig) { #endif /* !WINDOWS */ static void usage(const char *prog) { - fprintf(stderr, - "usage: %s [-V] [-q] [-c] src_path [dest_path]\n" - " -V\t\tprint version and exit\n" - " -q\t\tbe quiet\n" - " -c\t\tenable compactification (skip unused pages)\n" - " src_path\tsource database\n" - " dest_path\tdestination (stdout if not specified)\n", - prog); + fprintf( + stderr, + "usage: %s [-V] [-q] [-c] [-u|U] src_path [dest_path]\n" + " -V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -c\t\tenable compactification (skip unused pages)\n" + " -u\t\twarmup database before copying\n" + " -U\t\twarmup and try lock database pages in memory before copying\n" + " src_path\tsource database\n" + " dest_path\tdestination (stdout if not specified)\n", + prog); exit(EXIT_FAILURE); } @@ -62,6 +65,8 @@ int main(int argc, char *argv[]) { unsigned flags = MDBX_RDONLY; unsigned cpflags = 0; bool quiet = false; + bool warmup = false; + MDBX_warmup_flags_t 
warmup_flags = MDBX_warmup_default; for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { if (argv[1][1] == 'n' && argv[1][2] == '\0') @@ -70,8 +75,14 @@ int main(int argc, char *argv[]) { cpflags |= MDBX_CP_COMPACT; else if (argv[1][1] == 'q' && argv[1][2] == '\0') quiet = true; - else if ((argv[1][1] == 'h' && argv[1][2] == '\0') || - strcmp(argv[1], "--help") == 0) + else if (argv[1][1] == 'u' && argv[1][2] == '\0') + warmup = true; + else if (argv[1][1] == 'U' && argv[1][2] == '\0') { + warmup = true; + warmup_flags = + MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock; + } else if ((argv[1][1] == 'h' && argv[1][2] == '\0') || + strcmp(argv[1], "--help") == 0) usage(progname); else if (argv[1][1] == 'V' && argv[1][2] == '\0') { printf("mdbx_copy version %d.%d.%d.%d\n" @@ -120,7 +131,12 @@ int main(int argc, char *argv[]) { if (rc == MDBX_SUCCESS) rc = mdbx_env_open(env, argv[1], flags, 0); - if (rc == MDBX_SUCCESS) { + if (rc == MDBX_SUCCESS && warmup) { + act = "warming up"; + rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); + } + + if (!MDBX_IS_ERROR(rc)) { act = "copying"; if (argc == 2) { mdbx_filehandle_t fd; diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index 364e03ab..f710d33d 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -217,19 +217,23 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { } static void usage(void) { - fprintf(stderr, - "usage: %s [-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s subdb] " - "dbpath\n" - " -V\t\tprint version and exit\n" - " -q\t\tbe quiet\n" - " -f\t\twrite to file instead of stdout\n" - " -l\t\tlist subDBs and exit\n" - " -p\t\tuse printable characters\n" - " -r\t\trescue mode (ignore errors to dump corrupted DB)\n" - " -a\t\tdump main DB and all subDBs\n" - " -s name\tdump only the specified named subDB\n" - " \t\tby default dump only the main DB\n", - prog); + fprintf( + stderr, + "usage: %s " + "[-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s subdb] [-u|U] " + "dbpath\n" + " 
-V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -f\t\twrite to file instead of stdout\n" + " -l\t\tlist subDBs and exit\n" + " -p\t\tuse printable characters\n" + " -r\t\trescue mode (ignore errors to dump corrupted DB)\n" + " -a\t\tdump main DB and all subDBs\n" + " -s name\tdump only the specified named subDB\n" + " -u\t\twarmup database before dumping\n" + " -U\t\twarmup and try lock database pages in memory before dumping\n" + " \t\tby default dump only the main DB\n", + prog); exit(EXIT_FAILURE); } @@ -250,11 +254,14 @@ int main(int argc, char *argv[]) { char *subname = nullptr, *buf4free = nullptr; unsigned envflags = 0; bool alldbs = false, list = false; + bool warmup = false; + MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default; if (argc < 2) usage(); while ((i = getopt(argc, argv, + "uU" "a" "f:" "l" @@ -311,6 +318,14 @@ int main(int argc, char *argv[]) { case 'r': rescue = true; break; + case 'u': + warmup = true; + break; + case 'U': + warmup = true; + warmup_flags = + MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock; + break; default: usage(); } @@ -364,6 +379,14 @@ int main(int argc, char *argv[]) { goto env_close; } + if (warmup) { + rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); + if (MDBX_IS_ERROR(rc)) { + error("mdbx_env_warmup", rc); + goto env_close; + } + } + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_txn_begin", rc); From 836f6c27233e50e84dc9193b45339ae388f6c483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 24 Oct 2022 12:58:41 +0300 Subject: [PATCH 177/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/ChangeLog.md b/ChangeLog.md index 1777bfe5..b0848699 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -9,6 +9,11 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Новое: + - Добавлена функция `mdbx_env_warmup()` для "прогрева" БД с возможностью + закрепления страниц в памяти. + В утилиты `mdbx_chk`, `mdbx_copy` и `mdbx_dump` добавлены опции `-u` и `-U` + для активации соответствующего функционала. + - Отключение учета «грязных» страниц в не требующих этого режимах (`MDBX_WRITEMAP` при `MDBX_AVOID_MSYNC=0`). Доработка позволяет снизить накладные расходы и была запланирована давно, но откладывалась так как @@ -97,6 +102,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Исправления: + - Небольшие исправления для совместимости с CMake 3.8 - Больше контроля и осторожности (паранойи) для страховки от дефектов `mremap()`. - Костыль для починки сборки со старыми версиями `stdatomic.h` из GNU Lib C, где макросы `ATOMIC_*_LOCK_FREE` ошибочно переопределяются через функции. From 8f8b9f3d2a5601d4efcc10ac9ad21f7fb162a7e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 29 Oct 2022 14:07:56 +0300 Subject: [PATCH 178/364] =?UTF-8?q?mdbx:=20=D1=87=D1=83=D1=82=D0=BE=D0=BA?= =?UTF-8?q?=20=D0=B0=D0=BF=D0=BE=D1=81=D1=82=D0=BE=D1=84=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D0=B8=D0=BC=D0=B5=D0=BD=20=D0=B2=20doxyge?= =?UTF-8?q?n-=D0=BA=D0=BE=D0=BC=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D1=80=D0=B8?= =?UTF-8?q?=D1=8F=D1=85.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mdbx.h b/mdbx.h index 72b377f9..ffa7f3b5 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2444,7 +2444,7 @@ typedef struct MDBX_stat MDBX_stat; /** \brief Return statistics about the MDBX environment. 
* \ingroup c_statinfo * - * At least one of env or txn argument must be non-null. If txn is passed + * At least one of `env` or `txn` argument must be non-null. If txn is passed * non-null then stat will be filled accordingly to the given transaction. * Otherwise, if txn is null, then stat will be populated by a snapshot from * the last committed write transaction, and at next time, other information @@ -2562,7 +2562,7 @@ typedef struct MDBX_envinfo MDBX_envinfo; /** \brief Return information about the MDBX environment. * \ingroup c_statinfo * - * At least one of env or txn argument must be non-null. If txn is passed + * At least one of `env` or `txn` argument must be non-null. If txn is passed * non-null then stat will be filled accordingly to the given transaction. * Otherwise, if txn is null, then stat will be populated by a snapshot from * the last committed write transaction, and at next time, other information @@ -2871,7 +2871,7 @@ DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t) * Therefore an unused pages that are in GC handled (i.e. will be loaded) in * the same way as those that contain payload. * - * At least one of env or txn argument must be non-null. + * At least one of `env` or `txn` argument must be non-null. * * \param [in] env An environment handle returned * by \ref mdbx_env_create(). 
From 28e2e319499dd386781dd59145dad22de9ee7edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 1 Nov 2022 19:31:25 +0300 Subject: [PATCH 179/364] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=B4=D0=B5=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=81=D0=BF=D0=B5=D1=86=D0=B8=D1=84?= =?UTF-8?q?=D0=B8=D1=87=D0=B5=D1=81=D0=BA=D0=BE=D0=B9=20=D0=B8=D0=BD=D0=B8?= =?UTF-8?q?=D1=86=D0=B8=D0=B0=D0=BB=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D0=B8=20?= =?UTF-8?q?=D0=B2=20`osal=5Fctor()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 + src/internals.h | 2 + src/osal.c | 126 +++++++++++++++++++++++++++--------------------- src/osal.h | 12 ++--- 4 files changed, 80 insertions(+), 62 deletions(-) diff --git a/src/core.c b/src/core.c index 5c0d8815..a7854b7f 100644 --- a/src/core.c +++ b/src/core.c @@ -1501,6 +1501,7 @@ __cold void global_dtor(void) { workaround_glibc_bug21031(); #endif + osal_dtor(); TRACE("<< pid %d\n", osal_getpid()); } @@ -23946,6 +23947,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, } __cold void global_ctor(void) { + osal_ctor(); rthc_limit = RTHC_INITIAL_LIMIT; rthc_table = rthc_table_static; #if defined(_WIN32) || defined(_WIN64) diff --git a/src/internals.h b/src/internals.h index f35d9aad..15bcc583 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1464,7 +1464,9 @@ MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void osal_dtor(void); MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ diff --git a/src/osal.c b/src/osal.c index a1a85bac..a29919fa 100644 --- a/src/osal.c +++ b/src/osal.c @@ -623,13 +623,8 @@ 
MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, #endif /* !Windows */ #if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) - if (!osal_iov_max) { - osal_iov_max = sysconf(_SC_IOV_MAX); - if (RUNNING_ON_VALGRIND && osal_iov_max > 64) - /* чтобы не описывать все 1024 исключения в valgrind_suppress.txt */ - osal_iov_max = 64; - } -#endif + assert(osal_iov_max > 0); +#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ ior->boundary = (char *)(ior->pool + ior->allocated); return MDBX_SUCCESS; @@ -660,7 +655,6 @@ static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) { MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, void *data, const size_t bytes) { - assert(bytes && data); assert(bytes % MIN_PAGESIZE == 0 && bytes <= MAX_WRITE); assert(offset % MIN_PAGESIZE == 0 && offset + (uint64_t)bytes <= MAX_MAPSIZE); @@ -2634,10 +2628,15 @@ __cold MDBX_INTERNAL_FUNC void osal_jitter(bool tiny) { } } +/*----------------------------------------------------------------------------*/ + #if defined(_WIN32) || defined(_WIN64) +static LARGE_INTEGER performance_frequency; #elif defined(__APPLE__) || defined(__MACH__) #include +static uint64_t ratio_16dot16_to_monotine; #elif defined(__linux__) || defined(__gnu_linux__) +static clockid_t posix_clockid; __cold static clockid_t choice_monoclock(void) { struct timespec probe; #if defined(CLOCK_BOOTTIME) @@ -2652,27 +2651,16 @@ __cold static clockid_t choice_monoclock(void) { #endif return CLOCK_MONOTONIC; } -#endif - -/*----------------------------------------------------------------------------*/ - -#if defined(_WIN32) || defined(_WIN64) -static LARGE_INTEGER performance_frequency; -#elif defined(__APPLE__) || defined(__MACH__) -static uint64_t ratio_16dot16_to_monotine; +#elif defined(CLOCK_MONOTONIC) +#define posix_clockid CLOCK_MONOTONIC +#else +#define posix_clockid CLOCK_REALTIME #endif MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { #if defined(_WIN32) || 
defined(_WIN64) - if (unlikely(performance_frequency.QuadPart == 0)) - QueryPerformanceFrequency(&performance_frequency); const uint64_t ratio = performance_frequency.QuadPart; #elif defined(__APPLE__) || defined(__MACH__) - if (unlikely(ratio_16dot16_to_monotine == 0)) { - mach_timebase_info_data_t ti; - mach_timebase_info(&ti); - ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; - } const uint64_t ratio = ratio_16dot16_to_monotine; #else const uint64_t ratio = UINT64_C(1000000000); @@ -2681,22 +2669,18 @@ MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { return likely(ret || seconds_16dot16 == 0) ? ret : /* fix underflow */ 1; } +static uint64_t monotime_limit; MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { - static uint64_t limit; - if (unlikely(monotime > limit)) { - if (likely(limit != 0)) - return UINT32_MAX; - limit = osal_16dot16_to_monotime(UINT32_MAX - 1); - if (unlikely(monotime > limit)) - return UINT32_MAX; - } + if (unlikely(monotime > monotime_limit)) + return UINT32_MAX; + const uint32_t ret = #if defined(_WIN32) || defined(_WIN64) (uint32_t)((monotime << 16) / performance_frequency.QuadPart); #elif defined(__APPLE__) || defined(__MACH__) (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); #else - (uint32_t)(monotime * 128 / 1953125); + (uint32_t)((monotime << 7) / 1953125); #endif return ret; } @@ -2704,30 +2688,16 @@ MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER counter; - counter.QuadPart = 0; - QueryPerformanceCounter(&counter); - return counter.QuadPart; + if (QueryPerformanceCounter(&counter)) + return counter.QuadPart; #elif defined(__APPLE__) || defined(__MACH__) return mach_absolute_time(); #else - -#if defined(__linux__) || defined(__gnu_linux__) - static clockid_t posix_clockid = -1; - if (unlikely(posix_clockid < 0)) - 
posix_clockid = choice_monoclock(); -#elif defined(CLOCK_MONOTONIC) -#define posix_clockid CLOCK_MONOTONIC -#else -#define posix_clockid CLOCK_REALTIME -#endif - struct timespec ts; - if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) { - ts.tv_nsec = 0; - ts.tv_sec = 0; - } - return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; + if (likely(clock_gettime(posix_clockid, &ts) == 0)) + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; #endif + return 0; } /*----------------------------------------------------------------------------*/ @@ -2766,13 +2736,13 @@ __cold static void bootid_collect(bin128_t *p, const void *s, size_t n) { #if defined(_WIN32) || defined(_WIN64) -static uint64_t windows_systemtime_ms() { +__cold static uint64_t windows_systemtime_ms() { FILETIME ft; GetSystemTimeAsFileTime(&ft); return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul; } -static uint64_t windows_bootime(void) { +__cold static uint64_t windows_bootime(void) { unsigned confirmed = 0; uint64_t boottime = 0; uint64_t up0 = mdbx_GetTickCount64(); @@ -2799,8 +2769,9 @@ static uint64_t windows_bootime(void) { return 0; } -static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, LPCSTR lpValue, - PVOID pvData, LPDWORD pcbData) { +__cold static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, + LPCSTR lpValue, PVOID pvData, + LPDWORD pcbData) { LSTATUS rc; if (!mdbx_RegGetValueA) { /* an old Windows 2000/XP */ @@ -3294,3 +3265,48 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, return MDBX_SUCCESS; } + +#ifndef xMDBX_ALLOY +unsigned sys_pagesize; +MDBX_MAYBE_UNUSED unsigned sys_allocation_granularity; +#endif /* xMDBX_ALLOY */ + +void osal_ctor(void) { +#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) + osal_iov_max = sysconf(_SC_IOV_MAX); + if (RUNNING_ON_VALGRIND && osal_iov_max > 64) + /* чтобы не описывать все 1024 исключения в valgrind_suppress.txt */ + osal_iov_max = 64; +#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ + +#if 
defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + sys_pagesize = si.dwPageSize; + sys_allocation_granularity = si.dwAllocationGranularity; +#else + sys_pagesize = sysconf(_SC_PAGE_SIZE); + sys_allocation_granularity = (MDBX_WORDBITS > 32) ? 65536 : 4096; + sys_allocation_granularity = (sys_allocation_granularity > sys_pagesize) + ? sys_allocation_granularity + : sys_pagesize; +#endif + assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); + assert(sys_allocation_granularity >= sys_pagesize && + sys_allocation_granularity % sys_pagesize == 0); + +#if defined(__linux__) || defined(__gnu_linux__) + posix_clockid = choice_monoclock(); +#endif + +#if defined(_WIN32) || defined(_WIN64) + QueryPerformanceFrequency(&performance_frequency); +#elif defined(__APPLE__) || defined(__MACH__) + mach_timebase_info_data_t ti; + mach_timebase_info(&ti); + ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; +#endif + monotime_limit = osal_16dot16_to_monotime(UINT32_MAX - 1); +} + +void osal_dtor(void) {} diff --git a/src/osal.h b/src/osal.h index 017abcb4..d2603334 100644 --- a/src/osal.h +++ b/src/osal.h @@ -210,18 +210,16 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ +MDBX_INTERNAL_VAR unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; + /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t osal_syspagesize(void) { -#if defined(_WIN32) || defined(_WIN64) - SYSTEM_INFO si; - GetSystemInfo(&si); - return si.dwPageSize; -#else - return sysconf(_SC_PAGE_SIZE); -#endif + assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); + return sys_pagesize; } #if defined(_WIN32) || defined(_WIN64) From 91a6e84caba43733ad2af15e98501bdf10b07368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 3 Nov 2022 12:57:18 +0300 Subject: [PATCH 180/364] =?UTF-8?q?mdbx-windows:=20=D0=BF=D0=BE=D0=BF?= =?UTF-8?q?=D1=8B=D1=82=D0=BA=D0=B0=20=D0=B1=D0=BE=D1=80=D1=8C=D0=B1=D1=8B?= =?UTF-8?q?=20=D1=81=20=D0=BB=D0=BE=D0=B6=D0=BD=D0=BE-=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D0=BE=D0=B6=D0=B8=D1=82=D0=B5=D0=BB=D1=8C=D0=BD=D1=8B=D0=BC?= =?UTF-8?q?=D0=B8=20=D0=BA=D0=BE=D0=BD=D1=84=D0=BB=D0=B8=D0=BA=D1=82=D0=B0?= =?UTF-8?q?=D0=BC=D0=B8=20`LockFileEx()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lck-windows.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/lck-windows.c b/src/lck-windows.c index 7038854e..96d416ad 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -203,6 +203,16 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { dontwait ? 
(LCK_EXCLUSIVE | LCK_DONTWAIT) : (LCK_EXCLUSIVE | LCK_WAITFOR), DXB_BODY); + if (rc == ERROR_LOCK_VIOLATION && dontwait) { + SleepEx(0, true); + rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); + if (rc == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); + } + } if (rc == MDBX_SUCCESS) return rc; From d4e67d14ce2517d298c23d1f9e8afa2174d57761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 3 Nov 2022 17:23:32 +0300 Subject: [PATCH 181/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BD=D0=B5=D0=BE=D0=B6?= =?UTF-8?q?=D0=B8=D0=B4=D0=B0=D0=BD=D0=BD=D0=BE=D0=B3=D0=BE=20`MDBX=5FBUSY?= =?UTF-8?q?`=20=D0=B8=D0=B7=20`mdbx=5Fenv=5Fset=5Foption()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/core.c b/src/core.c index a7854b7f..80c320bf 100644 --- a/src/core.c +++ b/src/core.c @@ -23467,14 +23467,17 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return MDBX_EPERM; if (unlikely(value > SIZE_MAX - 65536)) return MDBX_TOO_LARGE; - if (atomic_store32(&env->me_lck->mti_autosync_threshold, - bytes2pgno(env, (size_t)value + env->me_psize - 1), - mo_Relaxed) != 0 && - (env->me_flags & MDBX_ENV_ACTIVE)) { - err = mdbx_env_sync_poll(env); - if (unlikely(MDBX_IS_ERROR(err))) - return err; - err = MDBX_SUCCESS; + value = bytes2pgno(env, (size_t)value + env->me_psize - 1); + if ((uint32_t)value != atomic_load32(&env->me_lck->mti_autosync_threshold, + mo_AcquireRelease) && + atomic_store32(&env->me_lck->mti_autosync_threshold, (uint32_t)value, + mo_Relaxed) + /* 
Дергаем sync(force=off) только если задано новое не-нулевое значение + * и мы вне транзакции */ + && lock_needed) { + err = env_sync(env, false, false); + if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) + err = MDBX_SUCCESS; } break; @@ -23487,14 +23490,16 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return MDBX_EPERM; if (unlikely(value > UINT32_MAX)) return MDBX_TOO_LARGE; - if (atomic_store64(&env->me_lck->mti_autosync_period, - osal_16dot16_to_monotime((uint32_t)value), - mo_Relaxed) != 0 && - (env->me_flags & MDBX_ENV_ACTIVE)) { - err = mdbx_env_sync_poll(env); - if (unlikely(MDBX_IS_ERROR(err))) - return err; - err = MDBX_SUCCESS; + value = osal_16dot16_to_monotime((uint32_t)value); + if (value != atomic_load64(&env->me_lck->mti_autosync_period, + mo_AcquireRelease) && + atomic_store64(&env->me_lck->mti_autosync_period, value, mo_Relaxed) + /* Дергаем sync(force=off) только если задано новое не-нулевое значение + * и мы вне транзакции */ + && lock_needed) { + err = env_sync(env, false, false); + if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) + err = MDBX_SUCCESS; } break; From 9cbd4e63ca6b483f8f902735f4b196bdc9d72eb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 4 Nov 2022 15:07:46 +0300 Subject: [PATCH 182/364] =?UTF-8?q?mdbx-test:=20=D0=BC=D0=B8=D0=BD=D0=BE?= =?UTF-8?q?=D1=80=D0=BD=D0=BE=D0=B5=20=D1=83=D1=82=D0=BE=D1=87=D0=BD=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20=D0=BF=D0=BE=D0=B4=D1=81=D0=BA=D0=B0=D0=B7?= =?UTF-8?q?=D0=BA=D0=B8=20`--help`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/main.cc b/test/main.cc index e5854276..67ea1bd0 100644 --- a/test/main.cc +++ b/test/main.cc @@ -24,7 +24,7 @@ MDBX_NORETURN void usage(void) { "usage:\n" " --help or -h 
Show this text\n" "Common parameters:\n" - " --loglevel=[0-7]|[extra..fatal]" + " --loglevel=[0-7]|[fatal..extra]s" " --pathname=... Path and/or name of database files\n" " --repeat=N Set repeat counter\n" " --threads=N Number of thread (unsupported for now)\n" From 47e7a646fd2b5709aa725ad6e605af0586cfe895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 4 Nov 2022 19:43:48 +0300 Subject: [PATCH 183/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D0=B5=D0=BB=D0=BA=D0=B0=20=D0=BE=D1=82=D1=81=D0=BB=D0=B5=D0=B6?= =?UTF-8?q?=D0=B8=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20mlocks=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D0=B8=D0=B3=D0=BD=D0=BE=D1=80=D0=B8=D1=80=D0=BE=D0=B2?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D1=8F=20`EINVAL`=20=D0=BE=D1=82=20`madvise()?= =?UTF-8?q?`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 124 ++++++++++++++++++++++++++---------------------- src/internals.h | 10 ++-- 2 files changed, 73 insertions(+), 61 deletions(-) diff --git a/src/core.c b/src/core.c index 80c320bf..bef4dd5e 100644 --- a/src/core.c +++ b/src/core.c @@ -5688,41 +5688,44 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, } #endif /* MDBX_ENABLE_MADVISE */ -__cold static void update_mlocked(const MDBX_env *env, - const pgno_t new_aligned_mlocked_pgno, - const bool lock_not_release) { +__cold static void update_mlcnt(const MDBX_env *env, + const pgno_t new_aligned_mlocked_pgno, + const bool lock_not_release) { for (;;) { - const pgno_t mlock_pgno_snap = + const pgno_t mlock_pgno_before = atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease); - eASSERT(env, pgno_align2os_pgno(env, mlock_pgno_snap) == mlock_pgno_snap); + eASSERT(env, + pgno_align2os_pgno(env, mlock_pgno_before) == mlock_pgno_before); eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) == 
new_aligned_mlocked_pgno); - if (lock_not_release ? (mlock_pgno_snap >= new_aligned_mlocked_pgno) - : (mlock_pgno_snap <= new_aligned_mlocked_pgno)) + if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno) + : (mlock_pgno_before <= new_aligned_mlocked_pgno)) break; if (likely(atomic_cas32(&((MDBX_env *)env)->me_mlocked_pgno, - mlock_pgno_snap, new_aligned_mlocked_pgno))) + mlock_pgno_before, new_aligned_mlocked_pgno))) for (;;) { - MDBX_atomic_uint32_t *const mlock_counter = - &env->me_lck->mti_mlock_counter; - const uint32_t snap_counter = atomic_load32(mlock_counter, mo_Relaxed); - if (mlock_pgno_snap == 0 && snap_counter < INT_MAX) { + MDBX_atomic_uint32_t *const mlcnt = env->me_lck->mti_mlcnt; + const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed); + const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed); + if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) { eASSERT(env, lock_not_release); - if (unlikely( - !atomic_cas32(mlock_counter, snap_counter, snap_counter + 1))) + if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1))) continue; } - if (new_aligned_mlocked_pgno == 0 && snap_counter > 0) { + if (new_aligned_mlocked_pgno == 0 && + (snap_locked - snap_unlocked) > 0) { eASSERT(env, !lock_not_release); if (unlikely( - !atomic_cas32(mlock_counter, snap_counter, snap_counter - 1))) + !atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1))) continue; } NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u", lock_not_release ? "lock" : "unlock", - lock_not_release ? mlock_pgno_snap : new_aligned_mlocked_pgno, - lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_snap, - snap_counter, atomic_load32(mlock_counter, mo_Relaxed)); + lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno, + lock_not_release ? 
new_aligned_mlocked_pgno : mlock_pgno_before, + snap_locked - snap_unlocked, + atomic_load32(mlcnt + 0, mo_Relaxed) - + atomic_load32(mlcnt + 1, mo_Relaxed)); return; } } @@ -5748,7 +5751,7 @@ __cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, : MDBX_SUCCESS; #endif if (likely(err == MDBX_SUCCESS)) - update_mlocked(env, aligned_pgno, false); + update_mlcnt(env, aligned_pgno, false); else { #if defined(_WIN32) || defined(_WIN64) WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size, @@ -5878,6 +5881,8 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, NOTICE("resize-MADV_%s %u..%u", (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, bytes2pgno(env, prev_size)); + const uint32_t munlocks_before = + atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); rc = MDBX_RESULT_TRUE; #if defined(MADV_REMOVE) if (env->me_flags & MDBX_WRITEMAP) @@ -5903,23 +5908,25 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, prev_size - size_bytes, POSIX_FADV_DONTNEED)); #endif /* MADV_DONTNEED */ - uint32_t snap_mlock_counter; - if (unlikely(rc == MDBX_EINVAL) && - (snap_mlock_counter = - atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed)) > 0) { - NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have %u " - "mlocked-process(es))", - "resize", rc, snap_mlock_counter); - } else { - if (unlikely(MDBX_IS_ERROR(rc))) { - ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d", + if (unlikely(MDBX_IS_ERROR(rc))) { + const uint32_t mlocks_after = + atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); + if (rc == MDBX_EINVAL) { + const int severity = + (mlocks_after - munlocks_before) ? 
MDBX_LOG_NOTICE : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log(severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "resize", rc, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", "mresize", "DONTNEED", size_bytes, prev_size - size_bytes, - atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), rc); + mlocks_after, munlocks_before, rc); goto bailout; } - if (env->me_lck->mti_discarded_tail.weak > size_pgno) - env->me_lck->mti_discarded_tail.weak = size_pgno; - } + } else + env->me_lck->mti_discarded_tail.weak = size_pgno; } #endif /* MDBX_ENABLE_MADVISE */ @@ -11473,6 +11480,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, ENSURE(env, prev_discarded_bytes > discard_edge_bytes); munlock_after(env, discard_edge_pgno, bytes_align2os_bytes(env, env->me_dxb_mmap.current)); + const uint32_t munlocks_before = + atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); #if defined(MADV_DONTNEED) int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ @@ -11489,23 +11498,27 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, env->me_map + discard_edge_bytes, prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); #endif - uint32_t snap_mlock_counter; - if (unlikely(err == MDBX_EINVAL) && - (snap_mlock_counter = atomic_load32(&env->me_lck->mti_mlock_counter, - mo_Relaxed)) > 0) { - NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have " - "%u mlocked-process(es))", - "shrink", err, snap_mlock_counter); - } else if (unlikely(MDBX_IS_ERROR(err))) { - ERROR("%s-madvise(%s, %zu..%zu), err %d", "shrink", "DONTNEED", - discard_edge_bytes, prev_discarded_bytes - discard_edge_bytes, - err); - ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d", - "shrink", "DONTNEED", discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, - 
atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), err); - return err; - } + if (unlikely(MDBX_IS_ERROR(err))) { + const uint32_t mlocks_after = + atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); + if (err == MDBX_EINVAL) { + const int severity = (mlocks_after - munlocks_before) + ? MDBX_LOG_NOTICE + : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log(severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "shrink", err, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", + "shrink", "DONTNEED", discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, mlocks_after, + munlocks_before, err); + return err; + } + } else + env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; } #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ @@ -11517,10 +11530,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) { if (pending->mm_geo.now > largest_pgno && pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { - pgno_t grow_step = 0; const pgno_t aligner = pending->mm_geo.grow_pv - ? (grow_step = pv2pages(pending->mm_geo.grow_pv)) + ? 
/* grow_step */ pv2pages(pending->mm_geo.grow_pv) : shrink_step; const pgno_t with_backlog_gap = largest_pgno + backlog_gap; const pgno_t aligned = pgno_align2os_pgno( @@ -23818,7 +23830,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, rc = errno; WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc); } else { - update_mlocked(env, mlock_pgno, true); + update_mlcnt(env, mlock_pgno, true); rc = MDBX_SUCCESS; } if (rc != EINVAL) @@ -23929,7 +23941,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { #if defined(_WIN32) || defined(_WIN64) if (VirtualLock(env->me_map, used_range)) { - update_mlocked(env, mlock_pgno, true); + update_mlcnt(env, mlock_pgno, true); rc = MDBX_SUCCESS; } else { rc = (int)GetLastError(); @@ -23937,7 +23949,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, } #elif defined(_POSIX_MEMLOCK_RANGE) if (mlock(env->me_map, used_range) == 0) { - update_mlocked(env, mlock_pgno, true); + update_mlcnt(env, mlock_pgno, true); rc = MDBX_SUCCESS; } else { rc = errno; diff --git a/src/internals.h b/src/internals.h index 15bcc583..73a74a95 100644 --- a/src/internals.h +++ b/src/internals.h @@ -731,10 +731,10 @@ typedef struct MDBX_lockinfo { /* Marker to distinguish uniqueness of DB/CLK. */ MDBX_atomic_uint64_t mti_bait_uniqueness; - /* Counter of processes which had mlock()'ed some of mmapped DB pages. - * Non-zero means at least one process lock at leat one page, - * and therefore madvise() could return EINVAL. */ - MDBX_atomic_uint32_t mti_mlock_counter; + /* Paired counter of processes that have mlock()ed part of mmapped DB. + * The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process + * has locked at least one page, and therefore madvise() could return EINVAL.
*/ + MDBX_atomic_uint32_t mti_mlcnt[2]; MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ @@ -764,7 +764,7 @@ typedef struct MDBX_lockinfo { /* Timestamp of the last readers check. */ MDBX_atomic_uint64_t mti_reader_check_timestamp; - /* Number of page which was discarded last time by madvise(MADV_FREE). */ + /* Number of page which was discarded last time by madvise(DONTNEED). */ atomic_pgno_t mti_discarded_tail; /* Shared anchor for tracking readahead edge and enabled/disabled status. */ From 36eb40bccb8a79e11be7148886615ee82ea13780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 4 Nov 2022 21:06:24 +0300 Subject: [PATCH 184/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=BE=20=D0=BE=D0=BF=D0=B8=D1=81=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20=D0=BF=D0=B0=D1=80=D0=B0=D0=BC=D0=B5=D1=82=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=20`MDBX=5Fdebug=5Ffunc`=20=D0=B8=20`MDBX=5Fdebug?= =?UTF-8?q?=5Ffunc`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mdbx.h b/mdbx.h index ffa7f3b5..35c3318a 100644 --- a/mdbx.h +++ b/mdbx.h @@ -988,8 +988,16 @@ DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t) * called before printing the message and aborting. * \see mdbx_setup_debug() * - * \param [in] env An environment handle returned by \ref mdbx_env_create(). - * \param [in] msg The assertion message, not including newline. */ + * \param [in] loglevel The severity of message. + * \param [in] function The function name which emits message, + * may be NULL. + * \param [in] line The source code line number which emits message, + * may be zero. + * \param [in] fmt The printf-like format string with message. 
+ * \param [in] args The variable argument list respectively for the + * format-message string passed by `fmt` argument. + * May be NULL or invalid if the format-message string + * doesn't contain `%`-specification of arguments. */ typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function, int line, const char *fmt, va_list args) MDBX_CXX17_NOEXCEPT; @@ -1008,8 +1016,12 @@ LIBMDBX_API int mdbx_setup_debug(MDBX_log_level_t log_level, * called before printing the message and aborting. * \see mdbx_env_set_assert() * - * \param [in] env An environment handle returned by mdbx_env_create(). - * \param [in] msg The assertion message, not including newline. */ + * \param [in] env An environment handle. + * \param [in] msg The assertion message, not including newline. + * \param [in] function The function name where the assertion check failed, + * may be NULL. + * \param [in] line The line number in the source file + * where the assertion check failed, may be zero. */ typedef void MDBX_assert_func(const MDBX_env *env, const char *msg, const char *function, unsigned line) MDBX_CXX17_NOEXCEPT; From acaa1d82d90c944d40d4e94d2990c6de84e3d9aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 5 Nov 2022 14:04:38 +0300 Subject: [PATCH 185/364] mdbx: minor touch assertions for `issue#7`.
https://gitflic.ru/project/erthink/libmdbx/issue/7 --- src/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index bef4dd5e..60b060b9 100644 --- a/src/core.c +++ b/src/core.c @@ -5934,6 +5934,9 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, #if MDBX_ENABLE_MADVISE if (rc == MDBX_SUCCESS) { + eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); + eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); + eASSERT(env, size_bytes == env->me_dxb_mmap.current); env->me_lck->mti_discarded_tail.weak = size_pgno; const bool readahead = !(env->me_flags & MDBX_NORDAHEAD) && @@ -5950,9 +5953,9 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, bailout: if (rc == MDBX_SUCCESS) { - eASSERT(env, size_bytes == env->me_dxb_mmap.current); - eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); + eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); + eASSERT(env, size_bytes == env->me_dxb_mmap.current); #ifdef MDBX_USE_VALGRIND if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); From f680c9911683d72281884f5168fffd863bac6edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 6 Nov 2022 16:35:06 +0300 Subject: [PATCH 186/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D0=B5=D0=BB=D0=BA=D0=B0=20`page=5Falloc=5Fslowpath()`=20=D1=81?= =?UTF-8?q?=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5?= =?UTF-8?q?=D0=BC=20=D0=BF=D1=80=D0=BE=D1=84=D0=B8=D0=BB=D0=B8=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20GC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 81 +++- src/bits.md | 4 +- src/core.c | 1189 +++++++++++++++++++++++++++-------------------- src/internals.h | 71 ++- 
src/options.h | 7 + src/osal.c | 55 +++ src/osal.h | 1 + 7 files changed, 868 insertions(+), 540 deletions(-) diff --git a/mdbx.h b/mdbx.h index 35c3318a..b1cfc0f0 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2561,9 +2561,6 @@ struct MDBX_envinfo { msync; /**< Number of explicit msync-to-disk operations (not a pages) */ uint64_t fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */ - uint64_t - gcrtime_seconds16dot16; /**< Time spent loading and searching inside - GC (aka FreeDB) in 1/65536 of second */ } mi_pgop_stat; }; #ifndef __cplusplus @@ -3713,8 +3710,8 @@ struct MDBX_commit_latency { /** \brief Duration of preparation (commit child transactions, update * sub-databases records and cursors destroying). */ uint32_t preparation; - /** \brief Duration of GC/freeDB handling & updation. */ - uint32_t gc; + /** \brief Duration of GC update by wall clock. */ + uint32_t gc_wallclock; /** \brief Duration of internal audit if enabled. */ uint32_t audit; /** \brief Duration of writing dirty/modified data pages to a filesystem, @@ -3727,6 +3724,80 @@ struct MDBX_commit_latency { uint32_t ending; /** \brief The total duration of a commit. */ uint32_t whole; + /** \brief User-mode CPU time spent on GC update. */ + uint32_t gc_cputime; + + /** \brief Информация для профилирования работы GC. + * \note Статистика является общей для всех процессов работающих с одним + * файлом БД и хранится в LCK-файле. Данные аккумулируются при фиксации всех + * транзакций, но только в сборках libmdbx c установленной опцией + * \ref MDBX_ENABLE_PROFGC. Собранная статистика возвращаются любому процессу + * при использовании \ref mdbx_txn_commit_ex() и одновременно обнуляется + * при завершении транзакций верхнего уровня (не вложенных). */ + struct { + /** \brief Количество итераций обновления GC, + * больше 1 если были повторы/перезапуски. */ + uint32_t wloops; + /** \brief Количество итераций слияния записей GC. 
*/ + uint32_t coalescences; + /** \brief Количество уничтожений предыдущих надежных/устойчивых + * точек фиксации при работе в режиме \ref MDBX_UTTERLY_NOSYNC. */ + uint32_t wipes; + /** \brief Количество принудительных фиксаций на диск + * во избежание приращения БД при работе вне режима + * \ref MDBX_UTTERLY_NOSYNC. */ + uint32_t flushes; + /** \brief Количество обращений к механизму Handle-Slow-Readers + * во избежание приращения БД. + * \see MDBX_hsr_func */ + uint32_t kicks; + + /** \brief Счетчик выполнения по медленному пути (slow path execution count) + * GC ради данных пользователя. */ + uint32_t work_counter; + /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри + * GC ради данных пользователя. */ + uint32_t work_rtime_monotonic; + /** \brief Монотонное время по "настенным часам" затраченное + * на подготовку страниц извлекаемых из GC для данных пользователя, + * включая подкачку с диска. */ + uint32_t work_xtime_monotonic; + /** \brief Время ЦПУ в режиме пользователя затраченное на чтение и поиск + * внутри GC ради данных пользователя. */ + uint32_t work_rtime_cpu; + /** \brief Количество итераций поиска внутри GC при выделении страниц + * ради данных пользователя. */ + uint32_t work_rsteps; + /** \brief Количество запросов на выделение последовательностей страниц + * ради данных пользователя. */ + uint32_t work_xpages; + /** \brief Количество страничных промахов (page faults) внутри GC + * при выделении и подготовке страниц для данных пользователя. */ + uint32_t work_majflt; + + /** \brief Счетчик выполнения по медленному пути (slow path execution count) + * GC для целей поддержки и обновления самой GC. */ + uint32_t self_counter; + /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри + * GC для целей поддержки и обновления самой GC.
*/ + uint32_t self_rtime_monotonic; + /** \brief Монотонное время по "настенным часам" затраченное на подготовку + * страниц извлекаемых из GC для целей поддержки и обновления самой GC, + * включая подкачку с диска. */ + uint32_t self_xtime_monotonic; + /** \brief Время ЦПУ в режиме пользователе затраченное на чтение и поиск + * внтури GC для целей поддержки и обновления самой GC. */ + uint32_t self_rtime_cpu; + /** \brief Количество итераций поиска внутри GC при выделении страниц + * для целей поддержки и обновления самой GC. */ + uint32_t self_rsteps; + /** \brief Количество запросов на выделение последовательностей страниц + * для самой GC. */ + uint32_t self_xpages; + /** \brief Количество страничных промахов (page faults) внутри GC + * при выделении и подготовки страниц для самой GC. */ + uint32_t self_majflt; + } gc_prof; }; #ifndef __cplusplus /** \ingroup c_statinfo */ diff --git a/src/bits.md b/src/bits.md index 82c9eed4..a9b7c2b1 100644 --- a/src/bits.md +++ b/src/bits.md @@ -5,8 +5,8 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD 2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| | 3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | 4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | -5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | -6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | +5 |0000 0020| |TXN_UPDATE_GC |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | +6 |0000 0040| |TXN_FROZEN_RE |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | 7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | | | | 8 |0000 0100| _MAY_MOVE | | | | | | | <= | 9 |0000 0200| _MAY_UNMAP| | | | | | | <= | diff --git a/src/core.c b/src/core.c index 60b060b9..0f13c846 100644 --- a/src/core.c +++ b/src/core.c @@ -2442,19 +2442,21 @@ pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, } /* Merge a PNL onto a PNL. 
The destination PNL must be big enough */ -__hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { +__hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); assert(pnl_check(src, MAX_PAGENO + 1)); const size_t src_len = MDBX_PNL_GETSIZE(src); const size_t dst_len = MDBX_PNL_GETSIZE(dst); + size_t total = dst_len; if (likely(src_len > 0)) { - const size_t total = dst_len + src_len; + total += src_len; assert(MDBX_PNL_ALLOCLEN(dst) >= total); dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); MDBX_PNL_SETSIZE(dst, total); } assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + return total; } static void spill_remove(MDBX_txn *txn, size_t idx, pgno_t npages) { @@ -3791,15 +3793,15 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { } const size_t rpa = - pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, txn->mt_next_pgno); - tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) || - txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); - if (rpa <= MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) && - unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) + pnl_search(txn->tw.relist, dp->mp_pgno, txn->mt_next_pgno); + tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.relist) || + txn->tw.relist[rpa] != dp->mp_pgno); + if (rpa <= MDBX_PNL_GETSIZE(txn->tw.relist) && + unlikely(txn->tw.relist[rpa] == dp->mp_pgno)) return false; if (num > 1) { - const size_t rpb = pnl_search(txn->tw.reclaimed_pglist, - dp->mp_pgno + num - 1, txn->mt_next_pgno); + const size_t rpb = + pnl_search(txn->tw.relist, dp->mp_pgno + num - 1, txn->mt_next_pgno); tASSERT(txn, rpa == rpb); if (unlikely(rpa != rpb)) return false; @@ -3828,7 +3830,7 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { static void refund_reclaimed(MDBX_txn *txn) { /* Scanning in descend order */ pgno_t next_pgno = txn->mt_next_pgno; - const 
MDBX_PNL pnl = txn->tw.reclaimed_pglist; + const MDBX_PNL pnl = txn->tw.relist; tASSERT(txn, MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); #if MDBX_PNL_ASCENDING size_t i = MDBX_PNL_GETSIZE(pnl); @@ -3849,8 +3851,7 @@ static void refund_reclaimed(MDBX_txn *txn) { VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); txn->mt_next_pgno = next_pgno; - tASSERT(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - 1)); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - 1)); } static void refund_loose(MDBX_txn *txn) { @@ -4005,8 +4006,8 @@ static bool txn_refund(MDBX_txn *txn) { refund_loose(txn); while (true) { - if (MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) == 0 || - MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) + if (MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) != txn->mt_next_pgno - 1) break; refund_reclaimed(txn); @@ -4336,8 +4337,8 @@ status_done: reclaim: DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); - rc = pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + rc = pnl_insert_range(&txn->tw.relist, pgno, npages); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); return rc; @@ -5498,7 +5499,7 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, if (txn->tw.loose_count) { MDBX_page *loose = txn->tw.loose_pages; DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); - rc = pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); + rc = pnl_insert_range(&txn->tw.relist, loose->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; size_t di = dpl_search(txn, loose->mp_pgno); @@ -6564,13 +6565,13 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, 
//------------------------------------------------------------------------------ /* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, - * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. + * mt_relist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the GC, just merge GC records into mt_reclaimed_pglist + * Do not modify the GC, just merge GC records into mt_relist * and move mt_last_reclaimed to say which records were consumed. Only this - * function can create mt_reclaimed_pglist and move + * function can create mt_relist and move * mt_last_reclaimed/mt_next_pgno. * * [in] mc cursor A cursor handle identifying the transaction and @@ -6586,267 +6587,318 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, #define MDBX_ALLOC_RESERVE 16 #define MDBX_ALLOC_BACKLOG 32 #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) +#define MDBX_ALLOC_LIFO 128 + +static __inline bool is_gc_usable(const MDBX_txn *txn) { + /* If txn is updating the GC, then the retired-list cannot play catch-up with + * itself by growing while trying to save it. 
*/ + if (txn->mt_flags & (MDBX_TXN_UPDATE_GC | MDBX_TXN_FROZEN_RE)) + return false; + + /* avoid (recursive) search inside empty tree and while tree is + updating, todo4recovery://erased_by_github/libmdbx/issues/31 */ + if (txn->mt_dbs[FREE_DBI].md_entries == 0) + return false; + + /* If our dirty list is already full, we can't touch GC */ + if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && + !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) + return false; + + return true; +} + +static int gc_cursor_init(MDBX_cursor *mc, MDBX_txn *txn) { + if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->mt_dbs[FREE_DBI].md_flags); + return MDBX_CORRUPTED; + } + return cursor_init(mc, txn, FREE_DBI); +} + +static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const size_t num, + char flags) { +#if MDBX_ENABLE_PROFGC + const uint64_t monotime_before = osal_monotime(); + size_t majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + uint64_t monotime_shot = 0; +#endif /* MDBX_ENABLE_PROFGC */ -static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; +#if MDBX_ENABLE_PROFGC + profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? 
&env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; + prof->spe_counter += 1; +#endif /* MDBX_ENABLE_PROFGC */ + eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); + eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | + MDBX_ALLOC_BACKLOG)) == 0 || + (flags & MDBX_ALLOC_GC)); + eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | + MDBX_ALLOC_BACKLOG)) == 0 || + (flags & MDBX_ALLOC_NEW) == 0); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; - if (likely(flags & MDBX_ALLOC_GC)) { - if (unlikely( - /* If mc is updating the GC, then the retired-list cannot play - catch-up with itself by growing while trying to save it. */ - (mc->mc_flags & (C_RECLAIMING | C_GCFREEZE)) || - /* avoid (recursive) search inside empty tree and while tree is - updating, todo4recovery://erased_by_github/libmdbx/issues/31 */ - txn->mt_dbs[FREE_DBI].md_entries == 0 || - /* If our dirty list is already full, we can't touch GC */ - (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && - !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) - flags -= MDBX_ALLOC_GC; - else { - flags |= env->me_flags & MDBX_LIFORECLAIM; - if (txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) - flags |= MDBX_ALLOC_COALESCE; + pgno_t pgno = 0, *range = nullptr; + size_t re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + if (num > 1) { + eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); +#if MDBX_ENABLE_PROFGC + prof->xpages += 1; +#endif /* MDBX_ENABLE_PROFGC */ + if (re_len >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 
1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); + if (likely(range)) { + pgno = *range; + goto done; + } } + } else { + eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || + MDBX_PNL_GETSIZE(txn->tw.relist) == 0); } - eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; - size_t re_len = MDBX_PNL_GETSIZE(re_list); - pgno_t *range = nullptr; - txnid_t detent = 0, last = 0; -#if MDBX_ENABLE_PGOP_STAT - uint64_t timestamp = 0; -#endif /* MDBX_ENABLE_PGOP_STAT */ + //--------------------------------------------------------------------------- - while (true) { /* hsr-kick retry loop */ - MDBX_cursor_couple recur; - for (MDBX_cursor_op op = MDBX_FIRST;; - op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { - MDBX_val key, data; + if (likely(flags & MDBX_ALLOC_GC)) { + if (unlikely(!is_gc_usable(txn))) + goto no_gc; - /* Seek a big enough contiguous page range. - * Prefer pages with lower pgno. */ - eASSERT(env, - pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { - eASSERT(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && - MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); - range = re_list + (MDBX_PNL_ASCENDING ? 1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - tASSERT(txn, range == scan4range_checker(re_list, num - 1)); - if (likely(range)) { - pgno = *range; - goto done; - } - } + eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0); + flags += (env->me_flags & MDBX_LIFORECLAIM) ? 
MDBX_ALLOC_LIFO : 0; - if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ - if (unlikely(!(flags & MDBX_ALLOC_GC))) - break /* reclaiming is prohibited for now */; + const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num) + flags += MDBX_ALLOC_COALESCE; - /* Prepare to fetch and coalesce */ -#if MDBX_ENABLE_PGOP_STAT - if (likely(timestamp == 0)) - timestamp = osal_monotime(); -#endif /* MDBX_ENABLE_PGOP_STAT */ - detent = txn_oldest_reader(txn) + 1; + MDBX_cursor recur; + ret.err = gc_cursor_init(&recur, txn); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; - ret.err = cursor_init(&recur.outer, txn, FREE_DBI); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - if (flags & MDBX_LIFORECLAIM) { - /* Begin from oldest reader if any */ - if (detent > MIN_TXNID) { - last = detent - 1; - op = MDBX_SET_RANGE; - } - } else if (txn->tw.last_reclaimed) { - /* Continue lookup from txn->tw.last_reclaimed to oldest reader */ - last = txn->tw.last_reclaimed; - op = MDBX_SET_RANGE; - } + retry_gc_refresh_oldest:; + txnid_t oldest = txn_oldest_reader(txn); + if (unlikely(!oldest)) + goto no_gc; - key.iov_base = &last; - key.iov_len = sizeof(last); - } + retry_gc_have_oldest: + if (unlikely(oldest >= txn->mt_txnid)) { + ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN + " for current-txnid %" PRIaTXN, + oldest, txn->mt_txnid); + ret.err = MDBX_PROBLEM; + goto fail; + } + const txnid_t detent = oldest + 1; - if (!(flags & MDBX_LIFORECLAIM)) { - /* Do not try fetch more if the record will be too recent */ - if (op != MDBX_FIRST && ++last >= detent) { - detent = txn_oldest_reader(txn) + 1; - if (detent <= last) - break; - } - } - - ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); - if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { - if (op == MDBX_SET_RANGE) - continue; - const txnid_t snap = txn_oldest_reader(txn); - 
if (unlikely(detent <= snap)) { - detent = snap + 1; - last = snap; - key.iov_base = &last; - key.iov_len = sizeof(last); - op = MDBX_SET_RANGE; - ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); - } - } - if (unlikely(ret.err)) { - if (ret.err == MDBX_NOTFOUND) - break; - goto fail; - } - - if (unlikely(key.iov_len != sizeof(txnid_t))) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - last = unaligned_peek_u64(4, key.iov_base); - if (detent <= last) { - detent = txn_oldest_reader(txn) + 1; - if (detent <= last) { - if (flags & MDBX_LIFORECLAIM) - continue; - break; - } - } - - if (flags & MDBX_LIFORECLAIM) { - /* skip IDs of records that already reclaimed */ - if (txn->tw.lifo_reclaimed) { - size_t i; - for (i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i) - if (txn->tw.lifo_reclaimed[i] == last) - break; - if (i) - continue; - } - } - - /* Reading next GC record */ - MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top]; - if (unlikely((ret.err = node_read( - &recur.outer, - page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), - &data, mp)) != MDBX_SUCCESS)) - goto fail; - - if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { + txnid_t last = 0; + bool should_scan = false; + MDBX_cursor_op op = MDBX_FIRST; + if (flags & MDBX_ALLOC_LIFO) { + if (!txn->tw.lifo_reclaimed) { txn->tw.lifo_reclaimed = txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { ret.err = MDBX_ENOMEM; goto fail; } } + /* Begin lookup backward from oldest reader */ + last = detent - 1; + op = MDBX_SET_RANGE; + } else if (txn->tw.last_reclaimed) { + /* Continue lookup forward from last-reclaimed */ + last = txn->tw.last_reclaimed + 1; + if (last >= detent) + goto no_gc; + op = MDBX_SET_RANGE; + } - /* Append PNL from GC record to tw.reclaimed_pglist */ - cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); - pgno_t *gc_pnl = (pgno_t *)data.iov_base; - tASSERT(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); - if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < 
MDBX_PNL_SIZEOF(gc_pnl) || - !pnl_check(gc_pnl, txn->mt_next_pgno))) { - ret.err = MDBX_CORRUPTED; + next_gc:; + MDBX_val key; + key.iov_base = &last; + key.iov_len = sizeof(last); + +#if MDBX_ENABLE_PROFGC + prof->rsteps += 1; +#endif /* MDBX_ENABLE_PROFGC */ + + /* Seek first/next GC record */ + ret.err = mdbx_cursor_get(&recur, &key, NULL, op); + if (unlikely(ret.err != MDBX_SUCCESS)) { + if (unlikely(ret.err != MDBX_NOTFOUND)) goto fail; + if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { + op = MDBX_PREV; + goto next_gc; } - const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); - if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( - txn->tw.reclaimed_pglist) >= - env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ - (flags & MDBX_ALLOC_SLOT) == 0 && - /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + (size_t)num) || - gc_len + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) >= - MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid large/overflow the page list. - * This is a rare case while search for a continuously multi-page region - * in a large database. 
- * todo4recovery://erased_by_github/libmdbx/issues/123 */ - NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " - "(chunk) -> %zu", - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); - flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); - break; - } - ret.err = pnl_need(&txn->tw.reclaimed_pglist, gc_len); + goto depleted_gc; + } + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + last = unaligned_peek_u64(4, key.iov_base); + if (flags & MDBX_ALLOC_LIFO) { + op = MDBX_PREV; + if (last >= detent) + goto next_gc; + /* skip IDs of records that already reclaimed */ + for (size_t i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i) + if (txn->tw.lifo_reclaimed[i] == last) + goto next_gc; + } else { + op = MDBX_NEXT; + if (unlikely(last >= detent)) + goto depleted_gc; + } + + /* Reading next GC record */ + MDBX_val data; + MDBX_page *const mp = recur.mc_pg[recur.mc_top]; + if (unlikely((ret.err = node_read(&recur, + page_node(mp, recur.mc_ki[recur.mc_top]), + &data, mp)) != MDBX_SUCCESS)) + goto fail; + + eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); + pgno_t *gc_pnl = (pgno_t *)data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !pnl_check(gc_pnl, txn->mt_next_pgno))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( + txn->tw.relist) >= env->me_options.rp_augment_limit) && + ((/* not a slot-request from gc-update */ + (flags & MDBX_ALLOC_SLOT) == 0 && + /* have enough unallocated space */ txn->mt_geo.upper >= + txn->mt_next_pgno + num) || + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { + /* Stop reclaiming to avoid large/overflow the page list. + * This is a rare case while search for a continuously multi-page region + * in a large database. 
+ * todo4recovery://erased_by_github/libmdbx/issues/123 */ + NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " + "(chunk) -> %zu", + MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto depleted_gc; + } + + /* Remember ID of readed GC record */ + txn->tw.last_reclaimed = last; + if (flags & MDBX_ALLOC_LIFO) { + ret.err = txl_append(&txn->tw.lifo_reclaimed, last); if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; - re_list = txn->tw.reclaimed_pglist; + } - /* Remember ID of GC record */ - if (flags & MDBX_LIFORECLAIM) { - ret.err = txl_append(&txn->tw.lifo_reclaimed, last); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - } - txn->tw.last_reclaimed = last; + /* Append PNL from GC record to tw.relist */ + ret.err = pnl_need(&txn->tw.relist, gc_len); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + txn->tw.relist = txn->tw.relist; - if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " len %zu, PNL", - last, txn->mt_dbs[FREE_DBI].md_root, gc_len); - for (size_t i = gc_len; i; i--) - DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); - DEBUG_EXTRA_PRINT("%s\n", "."); - } + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO + " len %zu, PNL", + last, txn->mt_dbs[FREE_DBI].md_root, gc_len); + for (size_t i = gc_len; i; i--) + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); + DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno); + } - /* Merge in descending sorted order */ - pnl_merge(re_list, gc_pnl); - if (AUDIT_ENABLED() && unlikely(!pnl_check(re_list, txn->mt_next_pgno))) { + /* Merge in descending sorted order */ + re_len = pnl_merge(txn->tw.relist, gc_pnl); + should_scan = true; + if (AUDIT_ENABLED()) { + if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; } - tASSERT(txn, dirtylist_check(txn)); + } else { + eASSERT(env, pnl_check_allocated(txn->tw.relist, 
txn->mt_next_pgno)); + } + eASSERT(env, dirtylist_check(txn)); - re_len = MDBX_PNL_GETSIZE(re_list); - tASSERT(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); - if (MDBX_ENABLE_REFUND && re_len && - unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { - /* Refund suitable pages into "unallocated" space */ - txn_refund(txn); - re_list = txn->tw.reclaimed_pglist; - re_len = MDBX_PNL_GETSIZE(re_list); - } + eASSERT(env, + re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); + if (MDBX_ENABLE_REFUND && re_len && + unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { + /* Refund suitable pages into "unallocated" space */ + if (txn_refund(txn)) + re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + } + eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - /* Done for a kick-reclaim mode, actually no page needed */ - if (unlikely(flags & MDBX_ALLOC_SLOT)) { - DEBUG("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); -#if MDBX_ENABLE_PGOP_STAT - eASSERT(env, timestamp != 0); - env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; -#endif /* MDBX_ENABLE_PGOP_STAT */ - ret.err = MDBX_SUCCESS; - ret.page = NULL; - return ret; - } - - /* Don't try to coalesce too much. */ - if (re_len /* current size */ > coalesce_threshold) { - if (flags & MDBX_ALLOC_COALESCE) - TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - flags &= ~MDBX_ALLOC_COALESCE; - } + /* Done for a kick-reclaim mode, actually no page needed */ + if (unlikely(flags & MDBX_ALLOC_SLOT)) { + eASSERT(env, ret.err == MDBX_SUCCESS); + goto early_exit; } - if (F_ISSET(flags, MDBX_ALLOC_COALESCE | MDBX_ALLOC_GC)) { - DEBUG_EXTRA("clear %s and continue", "MDBX_ALLOC_COALESCE"); + /* TODO: delete reclaimed records */ + + /* Don't try to coalesce too much. 
*/ + eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); + if (flags & MDBX_ALLOC_COALESCE) { + if (re_len /* current size */ < coalesce_threshold) { +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; +#endif /* MDBX_ENABLE_PROFGC */ + goto next_gc; + } + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); flags &= ~MDBX_ALLOC_COALESCE; - continue; } + scan: + eASSERT(env, should_scan); + if (re_len >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); + if (likely(range)) { + pgno = *range; + goto done; + } + } + should_scan = false; + if (ret.err == MDBX_SUCCESS) + goto next_gc; + + depleted_gc: + ret.err = MDBX_NOTFOUND; + if (should_scan) + goto scan; + + //------------------------------------------------------------------------- + /* There is no suitable pages in the GC and to be able to allocate * we should CHOICE one of: * - make a new steady checkpoint if reclaiming was stopped by @@ -6855,203 +6907,241 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { * - extend the database file. */ /* Will use new pages from the map if nothing is suitable in the GC. */ - range = nullptr; pgno = txn->mt_next_pgno; - const size_t next = (size_t)pgno + num; + const size_t newnext = num + pgno; - if (flags & MDBX_ALLOC_GC) { - const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); - /* does reclaiming stopped at the last steady point? 
*/ - if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && - detent == prefer_steady.txnid + 1) { - DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN - "-%s, detent %" PRIaTXN, - recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, - durable_caption(prefer_steady.ptr_c), detent); - ret.err = MDBX_RESULT_TRUE; - const pgno_t autosync_threshold = - atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); - const uint64_t autosync_period = - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); - uint64_t eoos_timestamp; - /* wipe the last steady-point if one of: - * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified - * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted - * otherwise, make a new steady-point if one of: - * - auto-sync threshold is specified and reached; - * - upper limit of database size is reached; - * - database is full (with the current file size) - * AND auto-sync threshold it NOT specified */ - if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && - ((autosync_threshold | autosync_period) == 0 || - next >= prefer_steady.ptr_c->mm_geo.now)) { - /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode - * without any auto-sync threshold(s). */ - ret.err = wipe_steady(txn, detent); - DEBUG("gc-wipe-steady, rc %d", ret.err); - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - } else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || - (autosync_threshold && - atomic_load64(&env->me_lck->mti_unsynced_pages, - mo_Relaxed) >= autosync_threshold) || - (autosync_period && - (eoos_timestamp = atomic_load64( - &env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && - osal_monotime() - eoos_timestamp >= autosync_period) || - next >= txn->mt_geo.upper || - (next >= txn->mt_end_pgno && - (autosync_threshold | autosync_period) == 0)) { - /* make steady checkpoint. 
*/ - MDBX_meta meta = *recent.ptr_c; - ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, - &txn->tw.troika); - DEBUG("gc-make-steady, rc %d", ret.err); - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - } - if (likely(ret.err != MDBX_RESULT_TRUE)) { - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - continue; - } + const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); + /* does reclaiming stopped at the last steady point? */ + if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && + detent == prefer_steady.txnid + 1) { + DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN + "-%s, detent %" PRIaTXN, + recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, + durable_caption(prefer_steady.ptr_c), detent); + const pgno_t autosync_threshold = + atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; + /* wipe the last steady-point if one of: + * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified + * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted + * otherwise, make a new steady-point if one of: + * - auto-sync threshold is specified and reached; + * - upper limit of database size is reached; + * - database is full (with the current file size) + * AND auto-sync threshold it NOT specified */ + if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && + ((autosync_threshold | autosync_period) == 0 || + newnext >= prefer_steady.ptr_c->mm_geo.now)) { + /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode + * without any auto-sync threshold(s). 
*/ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.wipes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + ret.err = wipe_steady(txn, detent); + DEBUG("gc-wipe-steady, rc %d", ret.err); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; + } + if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || + (autosync_threshold && + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + autosync_threshold) || + (autosync_period && + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period) || + newnext >= txn->mt_geo.upper || + (newnext >= txn->mt_end_pgno && + (autosync_threshold | autosync_period) == 0)) { + /* make steady checkpoint. */ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.flushes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + MDBX_meta meta = *recent.ptr_c; + ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, + &txn->tw.troika); + DEBUG("gc-make-steady, rc %d", ret.err); + eASSERT(env, ret.err != MDBX_RESULT_TRUE); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; } } - /* don't kick lagging reader(s) if is enough unallocated space + if (env->me_lck_mmap.lck && + unlikely(true == + atomic_load32(&env->me_lck_mmap.lck->mti_readers_refresh_flag, + mo_AcquireRelease))) { + oldest = txn_oldest_reader(txn); + if (oldest >= detent) + goto retry_gc_have_oldest; + } + + /* avoid kick lagging reader(s) if is enough unallocated space * at the end of database file. 
*/ - if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno) + if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) { + eASSERT(env, range == nullptr); goto done; - - if (flags & MDBX_ALLOC_GC) { - const txnid_t laggard = txn_oldest_reader(txn); - if (laggard >= detent || (laggard < txn->mt_txnid - xMDBX_TXNID_STEP && - kick_longlived_readers(env, laggard) >= detent)) - continue; } - ret.err = MDBX_NOTFOUND; - if (flags & MDBX_ALLOC_NEW) { - ret.err = MDBX_MAP_FULL; - if (next < txn->mt_geo.upper && txn->mt_geo.grow_pv) { - eASSERT(env, next > txn->mt_end_pgno); - const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv); - size_t aligned = pgno_align2os_pgno( - env, (pgno_t)(next + grow_step - next % grow_step)); - - if (aligned > txn->mt_geo.upper) - aligned = txn->mt_geo.upper; - eASSERT(env, aligned > txn->mt_end_pgno); - - VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, - aligned - txn->mt_end_pgno); - ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, - txn->mt_geo.upper); - if (ret.err == MDBX_SUCCESS) { - env->me_txn->mt_end_pgno = (pgno_t)aligned; - goto done; - } - - ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, - aligned - txn->mt_end_pgno, ret.err); - } else { - NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, next, - txn->mt_geo.upper); - } + if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { + oldest = kick_longlived_readers(env, oldest); + if (oldest >= detent) + goto retry_gc_have_oldest; } - - fail: -#if MDBX_ENABLE_PGOP_STAT - if (timestamp) - env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; -#endif /* MDBX_ENABLE_PGOP_STAT */ - eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - int level; - const char *what; - if (likely(!(flags & MDBX_ALLOC_RESERVE))) { - txn->mt_flags |= MDBX_TXN_ERROR; - level = MDBX_LOG_ERROR; - what = "pages"; - } else { - level = (flags & MDBX_ALLOC_BACKLOG) ? 
MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; - what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; - } - if (LOG_ENABLED(level)) - debug_log(level, __func__, __LINE__, - "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); - - eASSERT(env, ret.err != MDBX_SUCCESS); - ret.page = NULL; - return ret; } + //--------------------------------------------------------------------------- + +no_gc: + if ((flags & MDBX_ALLOC_NEW) == 0) { + ret.err = MDBX_NOTFOUND; + goto fail; + } + + /* Will use new pages from the map if nothing is suitable in the GC. */ + pgno = txn->mt_next_pgno; + const size_t newnext = num + pgno; + if (newnext <= txn->mt_end_pgno) + goto done; + + if (newnext > txn->mt_geo.upper || !txn->mt_geo.grow_pv) { + NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, newnext, txn->mt_geo.upper); + ret.err = MDBX_MAP_FULL; + goto fail; + } + + eASSERT(env, newnext > txn->mt_end_pgno); + const size_t grow_step = pv2pages(txn->mt_geo.grow_pv); + size_t aligned = pgno_align2os_pgno( + env, (pgno_t)(newnext + grow_step - newnext % grow_step)); + + if (aligned > txn->mt_geo.upper) + aligned = txn->mt_geo.upper; + eASSERT(env, aligned >= newnext); + +#if MDBX_ENABLE_PROFGC + monotime_shot = osal_monotime(); +#endif /* MDBX_ENABLE_PROFGC */ + VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, + aligned - txn->mt_end_pgno); + ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, + txn->mt_geo.upper); + if (ret.err != MDBX_SUCCESS) { + ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, + aligned - txn->mt_end_pgno, ret.err); + goto fail; + } + env->me_txn->mt_end_pgno = (pgno_t)aligned; + + //--------------------------------------------------------------------------- + done: - eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); - ENSURE(env, pgno >= NUM_METAS); -#if MDBX_ENABLE_PGOP_STAT - if (likely(timestamp)) - env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; -#endif /* 
MDBX_ENABLE_PGOP_STAT */ - if (unlikely(flags & MDBX_ALLOC_RESERVE)) { - DEBUG("return NULL for %u pages %s reservation", num, "gc-slot/backlog"); - ret.page = NULL; - ret.err = MDBX_SUCCESS; - return ret; - } - - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); - } else { - ret.page = page_malloc(txn, num); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto fail; - } - } - - if (range) { - cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); - tASSERT(txn, pgno < txn->mt_next_pgno); - tASSERT(txn, pgno == *range); - /* Cutoff allocated pages from tw.reclaimed_pglist */ + ret.err = MDBX_SUCCESS; + if (likely((flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) == 0)) { + ENSURE(env, pgno >= NUM_METAS); + if (range) { + eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); + eASSERT(env, pgno == *range); + eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); + /* Cutoff allocated pages from tw.relist */ #if MDBX_PNL_ASCENDING - for (const pgno_t *const end = re_list + re_len - num; range <= end; - ++range) - *range = range[num]; + for (const pgno_t *const end = re_list + re_len - num; range <= end; + ++range) + *range = range[num]; #else - for (const pgno_t *const end = re_list + re_len; ++range <= end;) - range[-(ptrdiff_t)num] = *range; + for (const pgno_t *const end = txn->tw.relist + re_len; ++range <= end;) + range[-(ptrdiff_t)num] = *range; #endif - MDBX_PNL_SETSIZE(re_list, re_len -= num); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + MDBX_PNL_SETSIZE(txn->tw.relist, re_len -= num); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + } else { + eASSERT(env, flags & MDBX_ALLOC_NEW); + pgno = txn->mt_next_pgno; + txn->mt_next_pgno += (pgno_t)num; + eASSERT(env, txn->mt_next_pgno <= 
txn->mt_end_pgno); + eASSERT(env, pgno >= NUM_METAS && pgno + num <= txn->mt_next_pgno); + } + +#if MDBX_ENABLE_PROFGC + if (!monotime_shot) + monotime_shot = osal_monotime(); +#endif /* MDBX_ENABLE_PROFGC */ + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + } else { + ret.page = page_malloc(txn, num); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + goto fail; + } + } + + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(ret.page, -1, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { + ret.page->mp_pages = (pgno_t)num; + ret.page->mp_flags = P_OVERFLOW; + } + + ret.err = page_dirty(txn, ret.page, (pgno_t)num); + if (unlikely(ret.err != MDBX_SUCCESS)) { + fail: + eASSERT(env, ret.err != MDBX_SUCCESS); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + int level; + const char *what; + if (flags & MDBX_ALLOC_RESERVE) { + level = (flags & MDBX_ALLOC_BACKLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; + what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; + } else { + txn->mt_flags |= MDBX_TXN_ERROR; + level = MDBX_LOG_ERROR; + what = "pages"; + } + if (LOG_ENABLED(level)) + debug_log(level, __func__, __LINE__, + "unable alloc %zu %s, flags 0x%x, errcode %d\n", num, what, + flags, ret.err); + ret.page = NULL; + } } else { - txn->mt_next_pgno = (pgno_t)(pgno + num); - eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); + early_exit: + DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, + (flags & MDBX_ALLOC_SLOT) ? 
"SLOT" : "RESERVE", ret.err); + ret.page = NULL; } - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(ret.page, -1, pgno2bytes(env, num)); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { - ret.page->mp_pages = (pgno_t)num; - ret.page->mp_flags = P_OVERFLOW; - } - ret.err = page_dirty(txn, ret.page, num); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; + const uint64_t monotime_now = osal_monotime(); + if (monotime_shot) { + prof->xtime_monotonic += monotime_shot - monotime_before; + prof->rtime_monotonic += monotime_now - monotime_shot; + } else + prof->rtime_monotonic += monotime_now - monotime_before; +#endif /* MDBX_ENABLE_PROFGC */ return ret; } @@ -7081,10 +7171,12 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { return ret; } - if (likely(!(mc->mc_flags & C_GCFREEZE))) { - MDBX_PNL pnl = txn->tw.reclaimed_pglist; + if (likely(!(txn->mt_flags & MDBX_TXN_FROZEN_RE))) { + MDBX_PNL pnl = txn->tw.relist; const size_t len = MDBX_PNL_GETSIZE(pnl); if (likely(len > 0)) { + MDBX_env *const env = txn->mt_env; + MDBX_PNL_SETSIZE(pnl, len - 1); #if MDBX_PNL_ASCENDING const pgno_t pgno = pnl[1]; @@ -7094,7 +7186,14 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { const pgno_t pgno = pnl[len]; #endif - MDBX_env *const env = txn->mt_env; +#if MDBX_ENABLE_PROFGC + const uint64_t monotime_before = osal_monotime(); + size_t majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + profgc_stat_t *const prof = + (mc->mc_dbi == FREE_DBI) ? 
&env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; +#endif /* MDBX_ENABLE_PROFGC */ pgr_t ret; if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); @@ -7103,7 +7202,7 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { ret.page = page_malloc(txn, 1); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; - return ret; + goto bailout; } } @@ -7114,8 +7213,15 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); ret.err = page_dirty(txn, ret.page, 1); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + bailout: + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; + prof->xtime_monotonic += osal_monotime() - monotime_before; +#endif /* MDBX_ENABLE_PROFGC */ return ret; } } @@ -8483,16 +8589,16 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_geo = parent->mt_geo; rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { - const size_t len = MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist) + - parent->tw.loose_count; - txn->tw.reclaimed_pglist = + const size_t len = + MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count; + txn->tw.relist = pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.reclaimed_pglist)) + if (unlikely(!txn->tw.relist)) rc = MDBX_ENOMEM; } if (unlikely(rc != MDBX_SUCCESS)) { nested_failed: - pnl_free(txn->tw.reclaimed_pglist); + pnl_free(txn->tw.relist); dpl_free(txn); osal_free(txn); return rc; @@ -8505,7 +8611,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, const size_t di = dpl_exist(parent, lp->mp_pgno); tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp); tASSERT(parent, lp->mp_flags == P_LOOSE); - rc = pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); + rc = pnl_insert_range(&parent->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto nested_failed; parent->tw.loose_pages = lp->mp_next; @@ -8525,12 +8631,12 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.spill_pages) spill_purge(parent); - tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= - MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist)); - memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, - MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); + tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= + MDBX_PNL_GETSIZE(parent->tw.relist)); + memcpy(txn->tw.relist, parent->tw.relist, + MDBX_PNL_SIZEOF(parent->tw.relist)); eASSERT(env, pnl_check_allocated( - txn->tw.reclaimed_pglist, + txn->tw.relist, (txn->mt_next_pgno /* LY: intentional assignment here, only for assertion */ = parent->mt_next_pgno) - @@ -9028,7 +9134,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { /* Export or close DBI handles created in this txn */ dbi_update(txn, mode & MDBX_END_UPDATE); pnl_shrink(&txn->tw.retired_pages); - pnl_shrink(&txn->tw.reclaimed_pglist); + pnl_shrink(&txn->tw.relist); if (!(env->me_flags & MDBX_WRITEMAP)) dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. 
*/ @@ -9039,7 +9145,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); eASSERT(env, parent->mt_child == txn && (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, sizeof(meta_troika_t)) == 0); @@ -9067,7 +9173,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { tASSERT(parent, audit_ex(parent, 0, false) == 0); dlist_free(txn); dpl_free(txn); - pnl_free(txn->tw.reclaimed_pglist); + pnl_free(txn->tw.relist); if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { @@ -9163,7 +9269,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) { size_t pending = 0; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) + + pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) + (MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored); MDBX_cursor_couple cx; @@ -9270,7 +9376,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + " "%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)", txn->mt_txnid, pending, txn->tw.loose_count, - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), + MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.retired_pages ? 
MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, retired_stored); ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu" @@ -9302,7 +9408,7 @@ static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { } static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { - return MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; + return MDBX_PNL_GETSIZE(txn->tw.relist) + txn->tw.loose_count; } static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { @@ -9367,7 +9473,8 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, return err; } - ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + tASSERT(txn, txn->mt_flags & MDBX_TXN_UPDATE_GC); + txn->mt_flags -= MDBX_TXN_UPDATE_GC; err = cursor_touch(&ctx->cursor.outer); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); @@ -9393,7 +9500,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG) .err; - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } @@ -9422,11 +9529,11 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { MDBX_env *const env = txn->mt_env; const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; (void)dbg_prefix_mode; - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI]; txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer; - /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. + /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. * Page numbers cannot disappear from txn->tw.retired_pages[]. 
*/ @@ -9434,7 +9541,7 @@ retry: ++ctx->loop; TRACE("%s", " >> restart"); int rc = MDBX_SUCCESS; - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { @@ -9468,7 +9575,7 @@ retry: goto bailout; } - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (ctx->lifo) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed @@ -9508,7 +9615,7 @@ retry: } } else { /* If using records from GC which we have not yet deleted, - * now delete them and any we reserved for tw.reclaimed_pglist. */ + * now delete them and any we reserved for tw.relist. */ while (ctx->cleaned_id <= txn->tw.last_reclaimed) { rc = cursor_first(&ctx->cursor.outer, &key, NULL); if (rc == MDBX_NOTFOUND) @@ -9542,7 +9649,7 @@ retry: } } - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); if (AUDIT_ENABLED()) { @@ -9553,7 +9660,7 @@ retry: /* return suitable into unallocated space */ if (txn_refund(txn)) { - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (AUDIT_ENABLED()) { rc = audit_ex(txn, ctx->retired_stored, false); @@ -9564,7 +9671,7 @@ retry: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { - /* Return loose page numbers to tw.reclaimed_pglist, + /* Return loose page numbers to tw.relist, * though usually none are left at this point. * The pages themselves remain in dirtylist. 
*/ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { @@ -9582,7 +9689,7 @@ retry: } /* Put loose page numbers in tw.retired_pages, - * since unable to return them to tw.reclaimed_pglist. */ + * since unable to return them to tw.relist. */ if (unlikely((rc = pnl_need(&txn->tw.retired_pages, txn->tw.loose_count)) != 0)) goto bailout; @@ -9593,11 +9700,10 @@ retry: } } else { /* Room for loose pages + temp PNL with same */ - rc = pnl_need(&txn->tw.reclaimed_pglist, 2 * txn->tw.loose_count + 2); + rc = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - MDBX_PNL loose = txn->tw.reclaimed_pglist + - MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) - + MDBX_PNL loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - txn->tw.loose_count - 1; size_t count = 0; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { @@ -9607,7 +9713,7 @@ retry: tASSERT(txn, count == txn->tw.loose_count); MDBX_PNL_SETSIZE(loose, count); pnl_sort(loose, txn->mt_next_pgno); - pnl_merge(txn->tw.reclaimed_pglist, loose); + pnl_merge(txn->tw.relist, loose); TRACE("%s: append %zu loose-pages to reclaimed-pages", dbg_prefix_mode, txn->tw.loose_count); } @@ -9652,15 +9758,15 @@ retry: #endif /* MDBX_ENABLE_REFUND */ } - const size_t amount = MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist); + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); /* handle retired-list - store ones into single gc-record */ if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ - ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + txn->mt_flags -= MDBX_TXN_UPDATE_GC; rc = page_search(&ctx->cursor.outer, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } @@ -9751,10 +9857,10 @@ retry: 
DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); DEBUG_EXTRA_PRINT("%s\n", "."); } - if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) && + if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.relist) && ctx->settled)) { TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix_mode, - amount, MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); + amount, MDBX_PNL_GETSIZE(txn->tw.relist)); goto retry /* rare case, but avoids GC fragmentation and one cycle. */ ; @@ -9763,7 +9869,7 @@ retry: } /* handle reclaimed and lost pages - merge and store both into gc */ - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, txn->tw.loose_count == 0); @@ -9800,7 +9906,7 @@ retry: bool need_cleanup = false; txnid_t snap_oldest; retry_rid: - ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + txn->mt_flags -= MDBX_TXN_UPDATE_GC; do { snap_oldest = txn_oldest_reader(txn); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, @@ -9818,7 +9924,7 @@ retry: left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page); - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: got enough from GC.", dbg_prefix_mode); @@ -9980,10 +10086,9 @@ retry: sizeof(pgno_t)) /* - 1 + span */; if (tail > avail) { for (size_t i = amount - span; i > 0; --i) { - if (MDBX_PNL_ASCENDING - ? (txn->tw.reclaimed_pglist[i] + span) - : (txn->tw.reclaimed_pglist[i] - span) == - txn->tw.reclaimed_pglist[i + span]) { + if (MDBX_PNL_ASCENDING ? 
(txn->tw.relist[i] + span) + : (txn->tw.relist[i] - span) == + txn->tw.relist[i + span]) { span += 1; avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - @@ -10031,7 +10136,7 @@ retry: gcu_prepare_backlog(txn, ctx, true); rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -10042,11 +10147,11 @@ retry: chunk); if (txn->tw.lifo_reclaimed && - unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)) && - (ctx->loop < 5 || MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) - amount > - env->me_maxgc_ov1page)) { + unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.relist)) && + (ctx->loop < 5 || + MDBX_PNL_GETSIZE(txn->tw.relist) - amount > env->me_maxgc_ov1page)) { NOTICE("** restart: reclaimed-list growth %zu -> %zu", amount, - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto retry; } @@ -10065,15 +10170,15 @@ retry: ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot : ctx->reused_slot; rc = MDBX_SUCCESS; - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); - if (MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)) { + if (MDBX_PNL_GETSIZE(txn->tw.relist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - const size_t amount = MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist); + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); size_t left = amount; if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); @@ -10087,7 +10192,7 @@ retry: while (true) { txnid_t fill_gc_id; TRACE("%s: left %zu of %zu", dbg_prefix_mode, left, - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); + MDBX_PNL_GETSIZE(txn->tw.relist)); if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); @@ -10125,7 +10230,7 @@ retry: key.iov_len = sizeof(fill_gc_id); tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); - ctx->cursor.outer.mc_flags |= C_GCFREEZE; + txn->mt_flags += MDBX_TXN_FROZEN_RE; size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, @@ -10134,22 +10239,21 @@ retry: chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); if (ctx->loop < 7) - ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; + txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; } chunk = left; } rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_CURRENT | MDBX_RESERVE); - ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; + txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; gcu_clean_reserved(env, data); if (unlikely(txn->tw.loose_count || - amount != MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist))) { + amount != 
MDBX_PNL_GETSIZE(txn->tw.relist))) { NOTICE("** restart: reclaimed-list growth (%zu -> %zu, loose +%zu)", - amount, MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), - txn->tw.loose_count); + amount, MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count); goto retry; } if (unlikely(txn->tw.lifo_reclaimed @@ -10170,12 +10274,12 @@ retry: pgno_t *dst = data.iov_base; *dst++ = (pgno_t)chunk; - pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; + pgno_t *src = MDBX_PNL_BEGIN(txn->tw.relist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; TRACE("%s: fill %zu [ %zu:%" PRIaPGNO "...%zu:%" PRIaPGNO "] @%" PRIaTXN, - dbg_prefix_mode, chunk, from - txn->tw.reclaimed_pglist, from[0], - to - txn->tw.reclaimed_pglist, to[-1], fill_gc_id); + dbg_prefix_mode, chunk, from - txn->tw.relist, from[0], + to - txn->tw.relist, to[-1], fill_gc_id); left -= chunk; if (AUDIT_ENABLED()) { @@ -10223,7 +10327,10 @@ retry: bailout: txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; - MDBX_PNL_SETSIZE(txn->tw.reclaimed_pglist, 0); + MDBX_PNL_SETSIZE(txn->tw.relist, 0); +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.wloops += ctx->loop; +#endif /* MDBX_ENABLE_PROFGC */ TRACE("<<< %zu loops, rc = %d", ctx->loop, rc); return rc; } @@ -10304,7 +10411,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } /* Remove reclaimed pages from parent's dirty list */ - const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; + const MDBX_PNL reclaimed_list = parent->tw.relist; dpl_sift(parent, reclaimed_list, false); /* Move retired pages from parent's dirty & spilled list to reclaimed */ @@ -10362,7 +10469,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, kind, pgno); - int err = pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); + int err = pnl_insert_range(&parent->tw.relist, pgno, l); 
ENSURE(txn->mt_env, err == MDBX_SUCCESS); } MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); @@ -10636,8 +10743,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); const uint64_t ts_0 = latency ? osal_monotime() : 0; - uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; + MDBX_env *const env = txn->mt_env; int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) goto provide_latency; @@ -10647,7 +10755,6 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - MDBX_env *env = txn->mt_env; #if MDBX_ENV_CHECKPID if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; @@ -10714,7 +10821,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { const size_t retired_delta = MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; if (retired_delta) { - rc = pnl_need(&txn->tw.reclaimed_pglist, retired_delta); + rc = pnl_need(&txn->tw.relist, retired_delta); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -10745,9 +10852,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->tw.retired_pages = txn->tw.retired_pages; txn->tw.retired_pages = NULL; - pnl_free(parent->tw.reclaimed_pglist); - parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; - txn->tw.reclaimed_pglist = NULL; + pnl_free(parent->tw.relist); + parent->tw.relist = txn->tw.relist; + txn->tw.relist = NULL; parent->tw.last_reclaimed = txn->tw.last_reclaimed; parent->mt_geo = txn->mt_geo; @@ -10779,9 +10886,14 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->mt_dbistate[i] = state; } - ts_1 = latency ? 
osal_monotime() : 0; + if (latency) { + ts_1 = osal_monotime(); + ts_2 = /* no gc-update */ ts_1; + ts_3 = /* no audit */ ts_2; + ts_4 = /* no write */ ts_3; + ts_5 = /* no sync */ ts_4; + } txn_merge(parent, txn, parent_retired_len); - ts_2 = ts_3 = latency ? osal_monotime() : 0; env->me_txn = parent; parent->mt_child = NULL; tASSERT(parent, dirtylist_check(parent)); @@ -10794,13 +10906,12 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && lp->mp_pgno + 1 < parent->mt_next_pgno); /* Check parent's reclaimed pages not suitable for refund */ - if (MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist)) - tASSERT(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < - parent->mt_next_pgno); + if (MDBX_PNL_GETSIZE(parent->tw.relist)) + tASSERT(parent, + MDBX_PNL_MOST(parent->tw.relist) + 1 < parent->mt_next_pgno); } #endif /* MDBX_ENABLE_REFUND */ - ts_4 = ts_5 = latency ? osal_monotime() : 0; txn->mt_signature = 0; osal_free(txn); tASSERT(parent, audit_ex(parent, 0, false) == 0); @@ -10868,11 +10979,14 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } ts_1 = latency ? osal_monotime() : 0; + gcu_context_t gcu_ctx; + gc_cputime = latency ? osal_cputime(nullptr) : 0; rc = gcu_context_init(txn, &gcu_ctx); if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = update_gc(txn, &gcu_ctx); + gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0; if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -10884,7 +10998,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; - ts_2 = ts_3 = latency ? osal_monotime() : 0; + ts_2 = latency ? osal_monotime() : 0; + ts_3 = ts_2; if (AUDIT_ENABLED()) { rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); ts_3 = osal_monotime(); @@ -10967,11 +11082,48 @@ done: provide_latency: if (latency) { latency->preparation = ts_1 ? 
osal_monotime_to_16dot16(ts_1 - ts_0) : 0; - latency->gc = (ts_1 && ts_2) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->audit = - (ts_2 && AUDIT_ENABLED()) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->write = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - latency->sync = (ts_4 && ts_5) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + latency->gc_wallclock = + (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0; + latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + +#if MDBX_ENABLE_PROFGC + pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; + latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; + latency->gc_prof.work_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); + latency->gc_prof.work_xtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_monotonic); + latency->gc_prof.work_rtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_cpu); + latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; + latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; + latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; + + latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; + latency->gc_prof.self_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); + latency->gc_prof.self_xtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_monotonic); + latency->gc_prof.self_rtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_cpu); + latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; + latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; + latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; + + latency->gc_prof.wloops = ptr->gc_prof.wloops; + 
latency->gc_prof.coalescences = ptr->gc_prof.coalescences; + latency->gc_prof.wipes = ptr->gc_prof.wipes; + latency->gc_prof.flushes = ptr->gc_prof.flushes; + latency->gc_prof.kicks = ptr->gc_prof.kicks; + if (txn == env->me_txn0) + memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); +#else + memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); +#endif /* MDBX_ENABLE_PROFGC */ + const uint64_t ts_6 = osal_monotime(); latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); @@ -11586,6 +11738,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.msync.weak += sync_op; +#else + (void)sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, @@ -11593,6 +11747,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, } else { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.fsync.weak += sync_op; +#else + (void)sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, mode_bits); } @@ -13966,8 +14122,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.reclaimed_pglist = pnl_alloc(MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) + txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) rc = MDBX_ENOMEM; } else rc = MDBX_ENOMEM; @@ -14098,7 +14254,7 @@ __cold static int env_close(MDBX_env *env) { txl_free(env->me_txn0->tw.lifo_reclaimed); pnl_free(env->me_txn0->tw.retired_pages); pnl_free(env->me_txn0->tw.spill_pages); - pnl_free(env->me_txn0->tw.reclaimed_pglist); + pnl_free(env->me_txn0->tw.relist); osal_free(env->me_txn0); env->me_txn0 = nullptr; } @@ -15944,7 
+16100,7 @@ static int touch_dbi(MDBX_cursor *mc) { *mc->mc_dbistate |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { - cASSERT(mc, (mc->mc_flags & C_RECLAIMING) == 0); + cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_UPDATE_GC) == 0); /* Touch DB record of named DB */ MDBX_cursor_couple cx; int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); @@ -16370,7 +16526,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* Is the ov page from this txn (or a parent) and big enough? */ int ovpages = lp.page->mp_pages; if (!IS_FROZEN(mc->mc_txn, lp.page) && - (unlikely(mc->mc_flags & C_GCFREEZE) + (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_FROZEN_RE) ? (ovpages >= dpages) : (ovpages == /* LY: add configurable threshold to keep reserve space */ @@ -17428,9 +17584,8 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); rc = (rc != MDBX_NOTFOUND) ? 
rc : MDBX_SUCCESS; - } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) { - rc = setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, - txn->mt_env->me_psize); + } else if (unlikely(dbx->md_klen_max == 0)) { + rc = setup_dbx(dbx, db, txn->mt_env->me_psize); } if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { @@ -21114,8 +21269,6 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); arg->mi_pgop_stat.fsync = atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed); - arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16( - atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed)); #else memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); #endif /* MDBX_ENABLE_PGOP_STAT*/ @@ -22064,8 +22217,12 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env, atomic_store64(&stucked->mr_tid, 0, mo_Relaxed); atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease); } - } else + } else if (!notify_eof_of_loop) { +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.kicks += 1; +#endif /* MDBX_ENABLE_PROFGC */ notify_eof_of_loop = true; + } } while (++retry < INT_MAX); diff --git a/src/internals.h b/src/internals.h index 73a74a95..28282414 100644 --- a/src/internals.h +++ b/src/internals.h @@ -578,10 +578,30 @@ typedef struct MDBX_page { #pragma pack(pop) -#if MDBX_ENABLE_PGOP_STAT +typedef struct profgc_stat { + /* Монотонное время по "настенным часам" + * затраченное на чтение и поиск внутри GC */ + uint64_t rtime_monotonic; + /* Монотонное время по "настенным часам" затраченное + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ + uint64_t xtime_monotonic; + /* Процессорное время в режим пользователя + * затраченное на чтение и поиск внутри GC */ + uint64_t rtime_cpu; + /* Количество итераций чтения-поиска внутри GC при выделении страниц */ + uint32_t rsteps; + /* Количество запросов на выделение последовательностей страниц, + * т.е. 
когда запрашивает выделение больше одной страницы */ + uint32_t xpages; + /* Счетчик выполнения по медленному пути (slow path execution count) */ + uint32_t spe_counter; + /* page faults (hard page faults) */ + uint32_t majflt; +} profgc_stat_t; + /* Statistics of page operations overall of all (running, completed and aborted) * transactions */ -typedef struct { +typedef struct pgop_stat { MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */ MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */ MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones @@ -592,15 +612,32 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ - MDBX_atomic_uint64_t - gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The - unit/scale is platform-depended, see osal_monotime(). */ MDBX_atomic_uint64_t msync; /* Number of explicit msync/flush-to-disk operations */ MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ -} MDBX_pgop_stat_t; -#endif /* MDBX_ENABLE_PGOP_STAT */ + + /* Статистика для профилирования GC. + * Логически эти данные может быть стоит вынести в другую структуру, + * но разница будет сугубо косметическая. 
*/ + struct { + /* Затраты на поддержку данных пользователя */ + profgc_stat_t work; + /* Затраты на поддержку и обновления самой GC */ + profgc_stat_t self; + /* Итераций обновления GC, + * больше 1 если были повторы/перезапуски */ + uint32_t wloops; + /* Итерации слияния записей GC */ + uint32_t coalescences; + /* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */ + uint32_t wipes; + /* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */ + uint32_t flushes; + /* Попытки пнуть тормозящих читателей */ + uint32_t kicks; + } gc_prof; +} pgop_stat_t; #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) @@ -738,11 +775,9 @@ typedef struct MDBX_lockinfo { MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ -#if MDBX_ENABLE_PGOP_STAT /* Statistics of costly ops of all (running, completed and aborted) * transactions */ - MDBX_pgop_stat_t mti_pgop_stat; -#endif /* MDBX_ENABLE_PGOP_STAT*/ + pgop_stat_t mti_pgop_stat; MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ @@ -962,9 +997,13 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) +#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */ +#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ + #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ + MDBX_TXN_FROZEN_RE) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -1023,8 +1062,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + pgno_t *relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* 
ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -1100,9 +1139,7 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ -#define C_RECLAIMING 0x20 /* GC lookup is prohibited */ -#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ - uint8_t mc_flags; /* see mdbx_cursor */ + uint8_t mc_flags; /* Cursor checking flags. */ #define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ @@ -1113,7 +1150,7 @@ struct MDBX_cursor { #define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ #define CC_RETIRING 0x40 /* refs to child pages may be invalid */ #define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ - uint8_t mc_checking; /* page checking level */ + uint8_t mc_checking; MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ diff --git a/src/options.h b/src/options.h index 127a13be..11df6967 100644 --- a/src/options.h +++ b/src/options.h @@ -73,6 +73,13 @@ #error MDBX_ENABLE_REFUND must be defined as 0 or 1 #endif /* MDBX_ENABLE_REFUND */ +/** Controls profiling of GC search and updates. */ +#ifndef MDBX_ENABLE_PROFGC +#define MDBX_ENABLE_PROFGC 0 +#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1) +#error MDBX_ENABLE_PROFGC must be defined as 0 or 1 +#endif /* MDBX_ENABLE_PROFGC */ + /** Controls gathering statistics for page operations. 
*/ #ifndef MDBX_ENABLE_PGOP_STAT #define MDBX_ENABLE_PGOP_STAT 1 diff --git a/src/osal.c b/src/osal.c index a29919fa..5d1339d3 100644 --- a/src/osal.c +++ b/src/osal.c @@ -18,6 +18,7 @@ #if defined(_WIN32) || defined(_WIN64) +#include #include #if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) @@ -2700,6 +2701,60 @@ MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { return 0; } +MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults) { +#if defined(_WIN32) || defined(_WIN64) + if (optional_page_faults) { + PROCESS_MEMORY_COUNTERS pmc; + *optional_page_faults = + GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc)) + ? pmc.PageFaultCount + : 0; + } + FILETIME unused, usermode; + if (GetThreadTimes(GetCurrentThread(), + /* CreationTime */ &unused, + /* ExitTime */ &unused, + /* KernelTime */ &unused, + /* UserTime */ &usermode)) { + /* one second = 10_000_000 * 100ns = 78125 * (1 << 7) * 100ns; + * result = (h * f / 10_000_000) << 32) + l * f / 10_000_000 = + * = ((h * f) >> 7) / 78125) << 32) + ((l * f) >> 7) / 78125; + * 1) {h, l} *= f; + * 2) {h, l} >>= 7; + * 3) result = ((h / 78125) << 32) + l / 78125; */ + uint64_t l = usermode.dwLowDateTime * performance_frequency.QuadPart; + uint64_t h = usermode.dwHighDateTime * performance_frequency.QuadPart; + l = h << (64 - 7) | l >> 7; + h = h >> 7; + return ((h / 78125) << 32) + l / 78125; + } +#elif defined(RUSAGE_THREAD) || defined(RUSAGE_LWP) +#ifndef RUSAGE_THREAD +#define RUSAGE_THREAD RUSAGE_LWP /* Solaris */ +#endif + struct rusage usage; + if (getrusage(RUSAGE_THREAD, &usage) == 0) { + if (optional_page_faults) + *optional_page_faults = usage.ru_majflt; + return usage.ru_utime.tv_sec * UINT64_C(1000000000) + + usage.ru_utime.tv_usec * 1000u; + } + if (optional_page_faults) + *optional_page_faults = 0; +#elif defined(CLOCK_THREAD_CPUTIME_ID) + if (optional_page_faults) + *optional_page_faults = 0; + struct timespec ts; + if (likely(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)) + return 
ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +#else + /* FIXME */ + if (optional_page_faults) + *optional_page_faults = 0; +#endif + return 0; +} + /*----------------------------------------------------------------------------*/ static void bootid_shake(bin128_t *p) { diff --git a/src/osal.h b/src/osal.h index d2603334..e8abacf3 100644 --- a/src/osal.h +++ b/src/osal.h @@ -620,6 +620,7 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) { #endif /* !Windows */ MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults); MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); From 4f1f9141f4b8b8e333f8a10bab94ac1ab305436b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 5 Nov 2022 15:11:43 +0300 Subject: [PATCH 187/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`MDBX=5FENABLE=5FPGOP=5FSTAT`?= =?UTF-8?q?=20=D0=B8=20`MDBX=5FENABLE=5FPROFGC`=20=D0=B2=D0=BE=20=D0=B2?= =?UTF-8?q?=D0=BD=D1=83=D1=82=D1=80=D0=B5=D0=BD=D0=BD=D1=8E=D1=8E=20=D1=81?= =?UTF-8?q?=D1=82=D1=80=D0=BE=D0=BA=D1=83=20=D1=81=20=D0=BE=D0=BF=D1=86?= =?UTF-8?q?=D0=B8=D1=8F=D0=BC=D0=B8=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B8?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core.c b/src/core.c index 0f13c846..66070d30 100644 --- a/src/core.c +++ b/src/core.c @@ -24359,6 +24359,8 @@ __dll_export " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) + " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) + " MDBX_ENABLE_PROFGC=" 
MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) #if MDBX_DISABLE_VALIDATION " MDBX_DISABLE_VALIDATION=YES" #endif /* MDBX_DISABLE_VALIDATION */ From 771c85a8802861ae9ea8f81b573797e401983605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 6 Nov 2022 08:46:48 +0300 Subject: [PATCH 188/364] =?UTF-8?q?mdbx:=20=D1=83=D1=82=D0=BE=D1=87=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`txn=5Fspace=5Fdirty`=20=D0=B2=20?= =?UTF-8?q?=D1=81=D0=BE=D0=BE=D1=82=D0=B2=D0=B5=D1=82=D1=81=D1=82=D0=B2?= =?UTF-8?q?=D0=B8=D0=B8=20=D1=81=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D1=8B=D0=BC=20=D1=83=D1=87=D0=B5=D1=82=D0=BE?= =?UTF-8?q?=D0=BC=20=D0=B3=D1=80=D1=8F=D0=B7=D0=BD=D1=8B=D1=85=20=D1=81?= =?UTF-8?q?=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 66070d30..74ec3854 100644 --- a/src/core.c +++ b/src/core.c @@ -8811,7 +8811,9 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); info->txn_space_dirty = - pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom); + txn->tw.dirtylist + ? pgno2bytes(env, txn->tw.dirtylist->pages_including_loose) + : 0; info->txn_reader_lag = INT64_MAX; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && lck) { From f0c6aa464652e158f78c124b60d741664a8a6b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 7 Nov 2022 00:14:24 +0300 Subject: [PATCH 189/364] mdbx: workaround for false-positives from Valgrind bug. 
--- src/core.c | 3 +++ src/internals.h | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/core.c b/src/core.c index 74ec3854..10e874aa 100644 --- a/src/core.c +++ b/src/core.c @@ -5256,6 +5256,9 @@ static __inline void meta_troika_unpack(meta_troika_t *troika, troika->recent = (packed >> 2) & 3; troika->prefer_steady = (packed >> 4) & 3; troika->tail_and_flags = packed & 0xC3; +#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ + troika->unused_pad = 0; +#endif } static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { diff --git a/src/internals.h b/src/internals.h index 28282414..55b17b3b 100644 --- a/src/internals.h +++ b/src/internals.h @@ -976,6 +976,9 @@ typedef struct MDBX_dbx { typedef struct troika { uint8_t fsm, recent, prefer_steady, tail_and_flags; +#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ + uint32_t unused_pad; +#endif #define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) #define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) #define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) From 5d36d242a7bcc0b79c16c9ee83b8e5598d9dad2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 7 Nov 2022 00:53:44 +0300 Subject: [PATCH 190/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index b0848699..4e4ca18a 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,11 +4,14 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by 
Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). - -## В разработке v0.12.2 +## v0.12.2 (Ярыгин) запланировано на 2022-11-07 Новое: + - Добавлена возможность профилирования работы GC в сложных и/или нагруженных + сценариях (например Ethereum/Erigon). По-умолчанию соответствующий код отключен, + а для его активации необходимо указать опцию сборки `MDBX_ENABLE_PROFGC=1`. + - Добавлена функция `mdbx_env_warmup()` для "прогрева" БД с возможностью закрепления страниц в памяти. В утилиты `mdbx_chk`, `mdbx_copy` и `mdbx_dump` добавлены опции `-u` и `-U` @@ -100,8 +103,12 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic `__builtin_cpu_supports()` function, which could be unavailable on a fake OSes (macos, ios, android, etc). -Исправления: +Исправления (без корректировок вышеперечисленных новых функций): + - Устранение ложно-положительных сообщений от Valgrind об использовании + не инициализированных данных из-за выравнивающих зазоров в `struct troika`. + - Исправлен возврат неожиданной ошибки `MDBX_BUSY` из функций `mdbx_env_set_option()`, + `mdbx_env_set_syncbytes()` и `mdbx_env_set_syncperiod()`. - Небольшие исправления для совместимости с CMake 3.8 - Больше контроля и осторожности (паранойи) для страховки от дефектов `mremap()`. - Костыль для починки сборки со старыми версиями `stdatomic.h` из GNU Lib C, @@ -128,6 +135,9 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Мелочи: + - Добавлено описание параметров `MDBX_debug_func` и `MDBX_debug_func`. + - Добавлено обходное решение для минимизации ложно-положительных + конфликтов при использовании файловых блокировок в Windows. - Проверка атомарности C11-операций c 32/64-битными данными. - Уменьшение в 42 раза значения по-умолчанию для `me_options.dp_limit` в отладочных сборках. 
From f7f94bb698621e0ff174363e6951ef7d0057a8aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 7 Nov 2022 10:57:42 +0300 Subject: [PATCH 191/364] =?UTF-8?q?mdbx:=20=D1=87=D1=83=D1=82=D1=8C=20?= =?UTF-8?q?=D0=B1=D0=BE=D0=BB=D1=8C=D1=88=D0=B5=20`const`=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D0=BF=D1=80=D0=BE=D0=B7=D1=80=D0=B0=D1=87=D0=BD=D0=BE?= =?UTF-8?q?=D1=81=D1=82=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index 10e874aa..139f8d85 100644 --- a/src/core.c +++ b/src/core.c @@ -6620,7 +6620,7 @@ static int gc_cursor_init(MDBX_cursor *mc, MDBX_txn *txn) { return cursor_init(mc, txn, FREE_DBI); } -static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const size_t num, +static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, char flags) { #if MDBX_ENABLE_PROFGC const uint64_t monotime_before = osal_monotime(); @@ -7148,7 +7148,7 @@ done: return ret; } -__hot static pgr_t page_alloc(MDBX_cursor *mc) { +__hot static pgr_t page_alloc(const MDBX_cursor *mc) { MDBX_txn *const txn = mc->mc_txn; /* If there are any loose pages, just use them */ From 3ee223514dd5b60cc2e9e3a8f359318a18b55c6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 7 Nov 2022 14:07:32 +0300 Subject: [PATCH 192/364] =?UTF-8?q?mdbx:=20=D0=BE=D1=87=D0=B8=D1=81=D1=82?= =?UTF-8?q?=D0=BA=D0=B0=20`readers=5Frefresh=5Fflag`=20=D0=B4=D0=BB=D1=8F?= =?UTF-8?q?=20`page=5Falloc=5Fslowpath()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index 139f8d85..fb30b30e 100644 
--- a/src/core.c +++ b/src/core.c @@ -5400,9 +5400,8 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { eASSERT(env, steady >= prev_oldest); txnid_t new_oldest = prev_oldest; - while (new_oldest != steady && - nothing_changed != - atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { + while (nothing_changed != + atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { lck->mti_readers_refresh_flag.weak = nothing_changed; jitter4testing(false); const size_t snap_nreaders = From 79c65821ee420994a7cd901d4bd898a09cb3cdb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 7 Nov 2022 14:19:30 +0300 Subject: [PATCH 193/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B8?= =?UTF-8?q?=D0=BC=D0=B5=D0=BD=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20`packa?= =?UTF-8?q?ges/rpm`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/{rpm => rpm.obsolete}/CMakeLists.txt | 0 packages/{rpm => rpm.obsolete}/build.sh | 0 packages/{rpm => rpm.obsolete}/package.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename packages/{rpm => rpm.obsolete}/CMakeLists.txt (100%) rename packages/{rpm => rpm.obsolete}/build.sh (100%) rename packages/{rpm => rpm.obsolete}/package.sh (100%) diff --git a/packages/rpm/CMakeLists.txt b/packages/rpm.obsolete/CMakeLists.txt similarity index 100% rename from packages/rpm/CMakeLists.txt rename to packages/rpm.obsolete/CMakeLists.txt diff --git a/packages/rpm/build.sh b/packages/rpm.obsolete/build.sh similarity index 100% rename from packages/rpm/build.sh rename to packages/rpm.obsolete/build.sh diff --git a/packages/rpm/package.sh b/packages/rpm.obsolete/package.sh similarity index 100% rename from packages/rpm/package.sh rename to packages/rpm.obsolete/package.sh From 623ab21707eaebfb19d5118b38a60d8330a14a5a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 7 Nov 2022 23:37:27 +0300 Subject: [PATCH 194/364] =?UTF-8?q?mdbx-cmake:=20=D1=81=D0=BE=D0=B2=D0=BC?= =?UTF-8?q?=D0=B5=D1=81=D1=82=D0=B8=D0=BC=D0=BE=D1=81=D1=82=D1=8C=20=D1=81?= =?UTF-8?q?=20CMake=203.0.2=20=D0=B4=D0=BB=D1=8F=20CI=20=D0=BD=D0=B0=20?= =?UTF-8?q?=D1=81=D1=82=D0=B0=D1=80=D1=8B=D1=85=20=D1=81=D0=B8=D1=81=D1=82?= =?UTF-8?q?=D0=B5=D0=BC=D0=B0=D1=85.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 20 ++++++++++++++------ cmake/compiler.cmake | 17 +++++++++++++---- cmake/profile.cmake | 4 +++- cmake/utils.cmake | 4 +++- test/CMakeLists.txt | 6 +++++- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dfbaa3de..277bad21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,9 @@ ## The Future will (be) Positive. Всё будет хорошо. ## -if(CMAKE_VERSION VERSION_LESS 3.12) +if(CMAKE_VERSION VERSION_LESS 3.8.2) + cmake_minimum_required(VERSION 3.0.2) +elseif(CMAKE_VERSION VERSION_LESS 3.12) cmake_minimum_required(VERSION 3.8.2) else() cmake_minimum_required(VERSION 3.12) @@ -385,6 +387,8 @@ if(NOT DEFINED MDBX_CXX_STANDARD) set(MDBX_CXX_STANDARD 14) elseif(NOT HAS_CXX11 LESS 0) set(MDBX_CXX_STANDARD 11) + elseif(CXX_FALLBACK_GNU11 OR CXX_FALLBACK_11) + set(MDBX_CXX_STANDARD 11) else() set(MDBX_CXX_STANDARD 98) endif() @@ -533,7 +537,7 @@ else() unset(MDBX_LINK_TOOLS_NONSTATIC CACHE) endif() -if(CMAKE_CXX_COMPILER_LOADED AND MDBX_CXX_STANDARD GREATER_EQUAL 11 AND MDBX_CXX_STANDARD LESS 83) +if(CMAKE_CXX_COMPILER_LOADED AND MDBX_CXX_STANDARD LESS 83 AND NOT MDBX_CXX_STANDARD LESS 11) if(NOT MDBX_AMALGAMATED_SOURCE) option(MDBX_ENABLE_TESTS "Build MDBX tests" ${BUILD_TESTING}) endif() @@ -639,7 +643,11 @@ macro(target_setup_options TARGET) endmacro() macro(libmdbx_setup_libs TARGET MODE) - 
target_link_libraries(${TARGET} ${MODE} Threads::Threads) + if(CMAKE_VERSION VERSION_LESS 3.1) + target_link_libraries(${TARGET} ${MODE} ${CMAKE_THREAD_LIBS_INIT}) + else() + target_link_libraries(${TARGET} ${MODE} Threads::Threads) + endif() if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") target_link_libraries(${TARGET} ${MODE} ntdll advapi32) if(MDBX_NTDLL_EXTRA_IMPLIB AND MDBX_WITHOUT_MSVC_CRT) @@ -906,16 +914,16 @@ else() endif() if(CMAKE_C_COMPILER_ABI AND NOT (CMAKE_C_COMPILER_ABI MATCHES ".*${MDBX_BUILD_TARGET}.*" OR MDBX_BUILD_TARGET MATCHES ".*${CMAKE_C_COMPILER_ABI}.*")) - string(APPEND MDBX_BUILD_TARGET "-${CMAKE_C_COMPILER_ABI}") + string(CONCAT MDBX_BUILD_TARGET "${MDBX_BUILD_TARGET}-${CMAKE_C_COMPILER_ABI}") endif() if(CMAKE_C_PLATFORM_ID AND NOT (CMAKE_SYSTEM_NAME AND (CMAKE_C_PLATFORM_ID MATCHES ".*${CMAKE_SYSTEM_NAME}.*" OR CMAKE_SYSTEM_NAME MATCHES ".*${CMAKE_C_PLATFORM_ID}.*")) AND NOT (CMAKE_C_PLATFORM_ID MATCHES ".*${CMAKE_C_PLATFORM_ID}.*" OR MDBX_BUILD_TARGET MATCHES ".*${CMAKE_C_PLATFORM_ID}.*")) - string(APPEND MDBX_BUILD_TARGET "-${CMAKE_C_COMPILER_ABI}") + string(CONCAT MDBX_BUILD_TARGET "${MDBX_BUILD_TARGET}-${CMAKE_C_COMPILER_ABI}") endif() if(CMAKE_SYSTEM_NAME) - string(APPEND MDBX_BUILD_TARGET "-${CMAKE_SYSTEM_NAME}") + string(CONCAT MDBX_BUILD_TARGET "${MDBX_BUILD_TARGET}-${CMAKE_SYSTEM_NAME}") endif() endif() diff --git a/cmake/compiler.cmake b/cmake/compiler.cmake index ef308fc2..78a31946 100644 --- a/cmake/compiler.cmake +++ b/cmake/compiler.cmake @@ -13,7 +13,9 @@ ## limitations under the License. 
## -if(CMAKE_VERSION VERSION_LESS 3.12) +if(CMAKE_VERSION VERSION_LESS 3.8.2) + cmake_minimum_required(VERSION 3.0.2) +elseif(CMAKE_VERSION VERSION_LESS 3.12) cmake_minimum_required(VERSION 3.8.2) else() cmake_minimum_required(VERSION 3.12) @@ -958,12 +960,13 @@ endmacro(setup_compile_flags) macro(probe_libcxx_filesystem) if(CMAKE_CXX_COMPILER_LOADED AND NOT DEFINED LIBCXX_FILESYSTEM) list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_11 HAS_CXX11) - if(NOT HAS_CXX11 LESS 0) + if(NOT HAS_CXX11 LESS 0 OR CXX_FALLBACK_GNU11 OR CXX_FALLBACK_11) include(CMakePushCheckState) include(CheckCXXSourceCompiles) cmake_push_check_state() set(stdfs_probe_save_libraries ${CMAKE_REQUIRED_LIBRARIES}) set(stdfs_probe_save_flags ${CMAKE_REQUIRED_FLAGS}) + set(stdfs_probe_flags ${CMAKE_REQUIRED_FLAGS}) set(stdfs_probe_save_link_options ${CMAKE_REQUIRED_LINK_OPTIONS}) unset(stdfs_probe_clear_cxx_standard) if(NOT DEFINED CMAKE_CXX_STANDARD) @@ -974,18 +977,23 @@ macro(probe_libcxx_filesystem) set(CMAKE_CXX_STANDARD 17) elseif(NOT HAS_CXX14 LESS 0) set(CMAKE_CXX_STANDARD 14) - else() + elseif(NOT HAS_CXX11 LESS 0) set(CMAKE_CXX_STANDARD 11) + elseif(CXX_FALLBACK_GNU11) + set(stdfs_probe_flags ${stdfs_probe_flags} "-std=gnu++11") + else() + set(stdfs_probe_flags ${stdfs_probe_flags} "-std=c++11") endif() set(stdfs_probe_clear_cxx_standard ON) endif() if(CMAKE_COMPILER_IS_ELBRUSCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 1.25.23) if(CMAKE_VERSION VERSION_LESS 3.14) - set(CMAKE_REQUIRED_FLAGS ${stdfs_probe_save_flags} "-Wl,--allow-multiple-definition") + set(stdfs_probe_flags ${stdfs_probe_flags} "-Wl,--allow-multiple-definition") else() set(CMAKE_REQUIRED_LINK_OPTIONS ${stdfs_probe_save_link_options} "-Wl,--allow-multiple-definition") endif() endif() + set(CMAKE_REQUIRED_FLAGS ${stdfs_probe_flags}) set(stdfs_probe_code [[ #if defined(__SIZEOF_INT128__) && !defined(__GLIBCXX_TYPE_INT_N_0) && defined(__clang__) && __clang_major__ < 4 @@ -1066,6 +1074,7 @@ macro(probe_libcxx_filesystem) 
unset(stdfs_probe_clear_cxx_standard) unset(stdfs_probe_save_link_options) unset(stdfs_probe_save_flags) + unset(stdfs_probe_flags) unset(stdfs_probe_save_libraries) unset(stdfs_probe_code) unset(stdfs_probe_rc) diff --git a/cmake/profile.cmake b/cmake/profile.cmake index 6fe9c821..c9b8bed4 100644 --- a/cmake/profile.cmake +++ b/cmake/profile.cmake @@ -13,7 +13,9 @@ ## limitations under the License. ## -if(CMAKE_VERSION VERSION_LESS 3.12) +if(CMAKE_VERSION VERSION_LESS 3.8.2) + cmake_minimum_required(VERSION 3.0.2) +elseif(CMAKE_VERSION VERSION_LESS 3.12) cmake_minimum_required(VERSION 3.8.2) else() cmake_minimum_required(VERSION 3.12) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index a461cc27..6a3315e1 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -13,7 +13,9 @@ ## limitations under the License. ## -if(CMAKE_VERSION VERSION_LESS 3.12) +if(CMAKE_VERSION VERSION_LESS 3.8.2) + cmake_minimum_required(VERSION 3.0.2) +elseif(CMAKE_VERSION VERSION_LESS 3.12) cmake_minimum_required(VERSION 3.8.2) else() cmake_minimum_required(VERSION 3.12) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5e7066e1..f8fb1618 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -54,7 +54,11 @@ if(NOT MDBX_BUILD_CXX AND LIBCXX_FILESYSTEM) target_link_libraries(mdbx_test ${LIBCXX_FILESYSTEM}) endif() -target_link_libraries(mdbx_test ${TOOL_MDBX_LIB} ${LIB_MATH} ${CMAKE_THREAD_LIBS_INIT}) +if(CMAKE_VERSION VERSION_LESS 3.1) + target_link_libraries(mdbx_test ${TOOL_MDBX_LIB} ${LIB_MATH} ${CMAKE_THREAD_LIBS_INIT}) +else() + target_link_libraries(mdbx_test ${TOOL_MDBX_LIB} ${LIB_MATH} Threads::Threads) +endif() if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") target_link_libraries(mdbx_test winmm.lib) endif() From 8c75ed59dacb78ec36da439c50c077b72cd3dba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 7 Nov 2022 23:57:25 +0300 Subject: [PATCH 195/364] 
=?UTF-8?q?mdbx-cmake:=20=D0=BE=D1=82=D0=BA=D0=BB?= =?UTF-8?q?=D1=8E=D1=87=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20LTO=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20G++=20<=207.0,=20=D0=B8=D0=B1=D0=BE=20=D0=BF=D0=B0?= =?UTF-8?q?=D0=B4=D0=B0=D0=B5=D1=82.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 277bad21..cd143ce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,9 @@ else() endif() if(INTERPROCEDURAL_OPTIMIZATION) - if(GCC_LTO_AVAILABLE) + if(GCC_LTO_AVAILABLE AND + # Отключаем LTO для G++ < 7.0, ибо падает: lto1: internal compiler error: in splice_child_die, at dwarf2out.c:5030 + (NOT CMAKE_CXX_COMPILER_LOADED OR NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)) set(LTO_ENABLED TRUE) set(CMAKE_AR ${CMAKE_GCC_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) set(CMAKE_C_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) From 987509f90ff8442fa876f57699e866a375419849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 8 Nov 2022 00:42:32 +0300 Subject: [PATCH 196/364] mdbx-cmake: try fix libmdbx-rs/issues/10. 
https://github.com/vorot93/libmdbx-rs/issues/10 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd143ce7..bd5bd1c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -651,7 +651,7 @@ macro(libmdbx_setup_libs TARGET MODE) target_link_libraries(${TARGET} ${MODE} Threads::Threads) endif() if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - target_link_libraries(${TARGET} ${MODE} ntdll advapi32) + target_link_libraries(${TARGET} ${MODE} ntdll user32 kernel32 advapi32) if(MDBX_NTDLL_EXTRA_IMPLIB AND MDBX_WITHOUT_MSVC_CRT) target_link_libraries(${TARGET} ${MODE} ntdll_extra) endif() From 652ca2b5cb1c79a0ee906de8d053a4c736ee39bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 8 Nov 2022 15:52:06 +0300 Subject: [PATCH 197/364] =?UTF-8?q?mdbx-windows:=20=D0=B8=D1=81=D0=BF?= =?UTF-8?q?=D1=80=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BC=D0=B8?= =?UTF-8?q?=D0=BD=D0=BE=D1=80=D0=BD=D1=8B=D1=85=20=D0=BF=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D0=B9=20?= =?UTF-8?q?MingGW.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lck-windows.c | 32 ++++++++++++++------------------ src/osal.c | 37 +++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 96d416ad..3bbe3f3b 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -112,11 +112,10 @@ static #define LCK_WAITFOR 0 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY -static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags, - uint64_t offset, size_t bytes) { - TRACE("lock>>: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64 - " >>", - fd, event, flags, offset, bytes); +static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags, + size_t 
offset, size_t bytes) { + TRACE("lock>>: fd %p, event %p, flags 0x%x offset %zu, bytes %zu >>", fd, + event, flags, offset, bytes); OVERLAPPED ov; ov.Internal = 0; ov.InternalHigh = 0; @@ -124,9 +123,8 @@ static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags, ov.Offset = (DWORD)offset; ov.OffsetHigh = HIGH_DWORD(offset); if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) { - TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 - ", bytes %" PRId64 " << %s", - fd, event, flags, offset, bytes, "done"); + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, + event, flags, offset, bytes, "done"); return MDBX_SUCCESS; } @@ -134,8 +132,7 @@ static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags, if (rc == ERROR_IO_PENDING) { if (event) { if (GetOverlappedResult(fd, &ov, &rc, true)) { - TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 - ", bytes %" PRId64 " << %s", + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, event, flags, offset, bytes, "overlapped-done"); return MDBX_SUCCESS; } @@ -143,25 +140,24 @@ static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags, } else CancelIo(fd); } - TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64 - " << err %d", - fd, event, flags, offset, bytes, rc); + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << err %d", + fd, event, flags, offset, bytes, (int)rc); return (int)rc; } -static __inline int flock(HANDLE fd, DWORD flags, uint64_t offset, +static __inline int flock(HANDLE fd, unsigned flags, size_t offset, size_t bytes) { return flock_with_event(fd, 0, flags, offset, bytes); } -static __inline int flock_data(const MDBX_env *env, DWORD flags, - uint64_t offset, size_t bytes) { +static __inline int flock_data(const MDBX_env *env, unsigned flags, + size_t offset, size_t bytes) { return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags, offset, bytes); } 
-static int funlock(mdbx_filehandle_t fd, uint64_t offset, size_t bytes) { - TRACE("unlock: fd %p, offset %" PRId64 ", bytes %" PRId64, fd, offset, bytes); +static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { + TRACE("unlock: fd %p, offset %zu, bytes %zu", fd, offset, bytes); return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, HIGH_DWORD(bytes)) ? MDBX_SUCCESS diff --git a/src/osal.c b/src/osal.c index 5d1339d3..494290a2 100644 --- a/src/osal.c +++ b/src/osal.c @@ -853,9 +853,10 @@ osal_ioring_write(osal_ioring_t *ior) { if (unlikely(r.err != ERROR_IO_PENDING)) { ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileGather", ior->fd, item, item - ior->pool, - ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, - item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + "WriteFileGather", ior->fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); goto bailout_rc; } assert(wait_for > ior->event_pool + ior->event_stack); @@ -874,9 +875,10 @@ osal_ioring_write(osal_ioring_t *ior) { default: ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileEx", ior->fd, item, item - ior->pool, - ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, - item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); goto bailout_rc; case ERROR_NOT_FOUND: case ERROR_USER_MAPPED_FILE: @@ -884,9 +886,10 @@ osal_ioring_write(osal_ioring_t *ior) { WARNING( "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileEx", ior->fd, item, item - ior->pool, - ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, - 
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); SleepEx(0, true); goto retry; case ERROR_INVALID_USER_BUFFER: @@ -906,9 +909,10 @@ osal_ioring_write(osal_ioring_t *ior) { r.err = (int)GetLastError(); ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFile", ior->fd, item, item - ior->pool, - ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, - item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + "WriteFile", ior->fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); goto bailout_rc; } else if (unlikely(written != bytes)) { r.err = ERROR_WRITE_FAULT; @@ -973,10 +977,11 @@ osal_ioring_write(osal_ioring_t *ior) { !GetOverlappedResult(ior->fd, &item->ov, &written, true))) { ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "GetOverlappedResult", item, item - ior->pool, + "GetOverlappedResult", __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - GetLastError()); + (int)GetLastError()); goto bailout_geterr; } assert(MDBX_SUCCESS == item->ov.Internal); @@ -994,10 +999,10 @@ osal_ioring_write(osal_ioring_t *ior) { r.err = (int)GetLastError(); ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "Result", item, item - ior->pool, + "Result", __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - GetLastError()); + (int)GetLastError()); goto bailout_rc; } if (unlikely(item->ov.InternalHigh != bytes)) { From 
d315a9255a6103ad82d2bc2b9cdb0482a6b4954b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 8 Nov 2022 16:17:14 +0300 Subject: [PATCH 198/364] =?UTF-8?q?mdbx-test:=20=D1=81=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B0=20=D1=80=D0=B0=D1=81=D1=88=D0=B8=D1=80=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F=20=D1=83=20C++=20=D0=B8=D1=81=D1=85=D0=BE=D0=B4=D0=BD?= =?UTF-8?q?=D0=B8=D0=BA=D0=BE=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GNUmakefile | 10 ++--- test/CMakeLists.txt | 52 +++++++++++----------- test/{append.cc => append.c++} | 2 +- test/{base.h => base.h++} | 0 test/{cases.cc => cases.c++} | 2 +- test/{chrono.cc => chrono.c++} | 2 +- test/{chrono.h => chrono.h++} | 4 +- test/{config.cc => config.c++} | 2 +- test/{config.h => config.h++} | 6 +-- test/{copy.cc => copy.c++} | 2 +- test/{dead.cc => dead.c++} | 2 +- test/{hill.cc => hill.c++} | 2 +- test/{jitter.cc => jitter.c++} | 2 +- test/{keygen.cc => keygen.c++} | 2 +- test/{keygen.h => keygen.h++} | 8 ++-- test/{log.cc => log.c++} | 2 +- test/{log.h => log.h++} | 4 +- test/{main.cc => main.c++} | 2 +- test/{nested.cc => nested.c++} | 2 +- test/{osal-unix.cc => osal-unix.c++} | 2 +- test/{osal-windows.cc => osal-windows.c++} | 2 +- test/{osal.h => osal.h++} | 2 +- test/{test.cc => test.c++} | 2 +- test/{test.h => test.h++} | 14 +++--- test/{try.cc => try.c++} | 2 +- test/{ttl.cc => ttl.c++} | 2 +- test/{utils.cc => utils.c++} | 2 +- test/{utils.h => utils.h++} | 2 +- 28 files changed, 69 insertions(+), 69 deletions(-) rename test/{append.cc => append.c++} (99%) rename test/{base.h => base.h++} (100%) rename test/{cases.cc => cases.c++} (99%) rename test/{chrono.cc => chrono.c++} (99%) rename test/{chrono.h => chrono.h++} (98%) rename test/{config.cc => config.c++} (99%) rename test/{config.h => config.h++} (99%) rename test/{copy.cc => copy.c++} (98%) rename 
test/{dead.cc => dead.c++} (98%) rename test/{hill.cc => hill.c++} (99%) rename test/{jitter.cc => jitter.c++} (99%) rename test/{keygen.cc => keygen.c++} (99%) rename test/{keygen.h => keygen.h++} (98%) rename test/{log.cc => log.c++} (99%) rename test/{log.h => log.h++} (98%) rename test/{main.cc => main.c++} (99%) rename test/{nested.cc => nested.c++} (99%) rename test/{osal-unix.cc => osal-unix.c++} (99%) rename test/{osal-windows.cc => osal-windows.c++} (99%) rename test/{osal.h => osal.h++} (98%) rename test/{test.cc => test.c++} (99%) rename test/{test.h => test.h++} (98%) rename test/{try.cc => try.c++} (97%) rename test/{ttl.cc => ttl.c++} (99%) rename test/{utils.cc => utils.c++} (99%) rename test/{utils.h => utils.h++} (99%) diff --git a/GNUmakefile b/GNUmakefile index 92c23dbc..68972f99 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -353,9 +353,9 @@ TEST_DB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.d TEST_LOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log TEST_OSAL := $(shell $(uname2osal)) TEST_ITER := $(shell $(uname2titer)) -TEST_SRC := test/osal-$(TEST_OSAL).cc $(filter-out $(wildcard test/osal-*.cc), $(wildcard test/*.cc)) -TEST_INC := $(wildcard test/*.h) -TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) +TEST_SRC := test/osal-$(TEST_OSAL).c++ $(filter-out $(wildcard test/osal-*.c++), $(wildcard test/*.c++)) +TEST_INC := $(wildcard test/*.h++) +TEST_OBJ := $(patsubst %.c++,%.o,$(TEST_SRC)) TAR ?= $(shell which gnu-tar || echo tar) ZIP ?= $(shell which zip || echo "echo 'Please install zip'") CLANG_FORMAT ?= $(shell (which clang-format-14 || which clang-format-13 || which clang-format) 2>/dev/null) @@ -363,7 +363,7 @@ CLANG_FORMAT ?= $(shell (which clang-format-14 || which clang-format-13 || which reformat: @echo ' RUNNING clang-format...' 
$(QUIET)if [ -n "$(CLANG_FORMAT)" ]; then \ - git ls-files | grep -E '\.(c|cxx|cc|cpp|h|hxx|hpp)(\.in)?$$' | xargs -r $(CLANG_FORMAT) -i --style=file; \ + git ls-files | grep -E '\.(c|c++|h|h++)(\.in)?$$' | xargs -r $(CLANG_FORMAT) -i --style=file; \ else \ echo "clang-format version 13..14 not found for 'reformat'"; \ fi @@ -469,7 +469,7 @@ mdbx_example: mdbx.h example/example-mdbx.c libmdbx.$(SO_SUFFIX) build-test: all mdbx_example mdbx_test define test-rule -$(patsubst %.cc,%.o,$(1)): $(1) $(TEST_INC) $(HEADERS) $(lastword $(MAKEFILE_LIST)) +$(patsubst %.c++,%.o,$(1)): $(1) $(TEST_INC) $(HEADERS) $(lastword $(MAKEFILE_LIST)) @echo ' CC $$@' $(QUIET)$$(CXX) $$(CXXFLAGS) $$(MDBX_BUILD_OPTIONS) -c $(1) -o $$@ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f8fb1618..0a067d09 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,32 +2,32 @@ enable_language(CXX) include(../cmake/compiler.cmake) set(LIBMDBX_TEST_SOURCES - base.h - cases.cc - chrono.cc - chrono.h - config.cc - config.h - copy.cc - dead.cc - hill.cc - jitter.cc - keygen.cc - keygen.h - log.cc - log.h - main.cc - osal.h - osal-unix.cc - osal-windows.cc - test.cc - test.h - try.cc - utils.cc - utils.h - append.cc - ttl.cc - nested.cc + base.h++ + cases.c++ + chrono.c++ + chrono.h++ + config.c++ + config.h++ + copy.c++ + dead.c++ + hill.c++ + jitter.c++ + keygen.c++ + keygen.h++ + log.c++ + log.h++ + main.c++ + osal.h++ + osal-unix.c++ + osal-windows.c++ + test.c++ + test.h++ + try.c++ + utils.c++ + utils.h++ + append.c++ + ttl.c++ + nested.c++ ) if(NOT MDBX_BUILD_CXX) diff --git a/test/append.cc b/test/append.c++ similarity index 99% rename from test/append.cc rename to test/append.c++ index d75e0231..5ca2245f 100644 --- a/test/append.cc +++ b/test/append.c++ @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" class testcase_append : public testcase { public: diff --git a/test/base.h b/test/base.h++ similarity index 100% rename from test/base.h rename to test/base.h++ diff --git a/test/cases.cc b/test/cases.c++ similarity index 99% rename from test/cases.cc rename to test/cases.c++ index 75432e5c..51d84b86 100644 --- a/test/cases.cc +++ b/test/cases.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" registry *registry::instance() { static registry *singleton; diff --git a/test/chrono.cc b/test/chrono.c++ similarity index 99% rename from test/chrono.cc rename to test/chrono.c++ index ec22b39b..33456867 100644 --- a/test/chrono.cc +++ b/test/chrono.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" namespace chrono { diff --git a/test/chrono.h b/test/chrono.h++ similarity index 98% rename from test/chrono.h rename to test/chrono.h++ index 5ee08856..4f86cf65 100644 --- a/test/chrono.h +++ b/test/chrono.h++ @@ -14,8 +14,8 @@ #pragma once -#include "base.h" -#include "utils.h" +#include "base.h++" +#include "utils.h++" namespace chrono { diff --git a/test/config.cc b/test/config.c++ similarity index 99% rename from test/config.cc rename to test/config.c++ index 38063892..e86984d4 100644 --- a/test/config.cc +++ b/test/config.c++ @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" #if defined(_MSC_VER) && !defined(strcasecmp) #define strcasecmp(str, len) _stricmp(str, len) diff --git a/test/config.h b/test/config.h++ similarity index 99% rename from test/config.h rename to test/config.h++ index 8c93981e..1e6e57e5 100644 --- a/test/config.h +++ b/test/config.h++ @@ -14,9 +14,9 @@ #pragma once -#include "base.h" -#include "log.h" -#include "utils.h" +#include "base.h++" +#include "log.h++" +#include "utils.h++" #define ACTOR_ID_MAX INT16_MAX diff --git a/test/copy.cc b/test/copy.c++ similarity index 98% rename from test/copy.cc rename to test/copy.c++ index d164fc45..93ae77c8 100644 --- a/test/copy.cc +++ b/test/copy.c++ @@ -1,4 +1,4 @@ -#include "test.h" +#include "test.h++" class testcase_copy : public testcase { const std::string copy_pathname; diff --git a/test/dead.cc b/test/dead.c++ similarity index 98% rename from test/dead.cc rename to test/dead.c++ index 05304bc2..0d698d91 100644 --- a/test/dead.cc +++ b/test/dead.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" class testcase_deadread : public testcase { public: diff --git a/test/hill.cc b/test/hill.c++ similarity index 99% rename from test/hill.cc rename to test/hill.c++ index 5aea4d71..3a5c29f3 100644 --- a/test/hill.cc +++ b/test/hill.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" /* LY: тест "холмиком": * - сначала наполняем таблицу циклическими CRUD-манипуляциями, diff --git a/test/jitter.cc b/test/jitter.c++ similarity index 99% rename from test/jitter.cc rename to test/jitter.c++ index 2c781d7f..391d5deb 100644 --- a/test/jitter.cc +++ b/test/jitter.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" class testcase_jitter : public testcase { protected: diff --git a/test/keygen.cc b/test/keygen.c++ similarity index 99% rename from test/keygen.cc rename to test/keygen.c++ index 807954f3..2cd7e574 100644 --- a/test/keygen.cc +++ b/test/keygen.c++ @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" namespace keygen { diff --git a/test/keygen.h b/test/keygen.h++ similarity index 98% rename from test/keygen.h rename to test/keygen.h++ index 53219f5d..54122ab1 100644 --- a/test/keygen.h +++ b/test/keygen.h++ @@ -14,10 +14,10 @@ #pragma once -#include "base.h" -#include "config.h" -#include "log.h" -#include "utils.h" +#include "base.h++" +#include "config.h++" +#include "log.h++" +#include "utils.h++" namespace keygen { diff --git a/test/log.cc b/test/log.c++ similarity index 99% rename from test/log.cc rename to test/log.c++ index 9597328f..f9cb1194 100644 --- a/test/log.cc +++ b/test/log.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" static void fflushall() { fflush(nullptr); } diff --git a/test/log.h b/test/log.h++ similarity index 98% rename from test/log.h rename to test/log.h++ index bc9f4579..0ff12ec2 100644 --- a/test/log.h +++ b/test/log.h++ @@ -14,8 +14,8 @@ #pragma once -#include "base.h" -#include "chrono.h" +#include "base.h++" +#include "chrono.h++" MDBX_NORETURN void usage(void); MDBX_NORETURN void MDBX_PRINTF_ARGS(1, 2) failure(const char *fmt, ...); diff --git a/test/main.cc b/test/main.c++ similarity index 99% rename from test/main.cc rename to test/main.c++ index 67ea1bd0..0cd82dbd 100644 --- a/test/main.cc +++ b/test/main.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #if !(defined(_WIN32) || defined(_WIN64)) #include diff --git a/test/nested.cc b/test/nested.c++ similarity index 99% rename from test/nested.cc rename to test/nested.c++ index 098eada0..60c02ae9 100644 --- a/test/nested.cc +++ b/test/nested.c++ @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" #include /* LY: тест "эмуляцией time-to-live" с вложенными транзакциями: diff --git a/test/osal-unix.cc b/test/osal-unix.c++ similarity index 99% rename from test/osal-unix.cc rename to test/osal-unix.c++ index 320ebad3..1711518e 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #if !(defined(_WIN32) || defined(_WIN64)) diff --git a/test/osal-windows.cc b/test/osal-windows.c++ similarity index 99% rename from test/osal-windows.cc rename to test/osal-windows.c++ index 70b8cf5c..711a9b12 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #if defined(_WIN32) || defined(_WIN64) diff --git a/test/osal.h b/test/osal.h++ similarity index 98% rename from test/osal.h rename to test/osal.h++ index a893ddb4..0fe44f68 100644 --- a/test/osal.h +++ b/test/osal.h++ @@ -14,7 +14,7 @@ #pragma once -#include "base.h" +#include "base.h++" void osal_setup(const std::vector &actors); void osal_broadcast(unsigned id); diff --git a/test/test.cc b/test/test.c++ similarity index 99% rename from test/test.cc rename to test/test.c++ index 60fd6914..1d06dd77 100644 --- a/test/test.cc +++ b/test/test.c++ @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" const char *testcase2str(const actor_testcase testcase) { switch (testcase) { diff --git a/test/test.h b/test/test.h++ similarity index 98% rename from test/test.h rename to test/test.h++ index 40bb01ac..4442aaa7 100644 --- a/test/test.h +++ b/test/test.h++ @@ -14,13 +14,13 @@ #pragma once -#include "base.h" -#include "chrono.h" -#include "config.h" -#include "keygen.h" -#include "log.h" -#include "osal.h" -#include "utils.h" +#include "base.h++" +#include "chrono.h++" +#include "config.h++" +#include "keygen.h++" +#include "log.h++" +#include "osal.h++" +#include "utils.h++" #include #include diff --git a/test/try.cc b/test/try.c++ similarity index 97% rename from test/try.cc rename to test/try.c++ index da81e631..50c959c9 100644 --- a/test/try.cc +++ b/test/try.c++ @@ -1,4 +1,4 @@ -#include "test.h" +#include "test.h++" class testcase_try : public testcase { public: diff --git a/test/ttl.cc b/test/ttl.c++ similarity index 99% rename from test/ttl.cc rename to test/ttl.c++ index de3c9f42..f8239e94 100644 --- a/test/ttl.cc +++ b/test/ttl.c++ @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #include #include diff --git a/test/utils.cc b/test/utils.c++ similarity index 99% rename from test/utils.cc rename to test/utils.c++ index 33420cc0..9e61e4bf 100644 --- a/test/utils.cc +++ b/test/utils.c++ @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" #include #if defined(HAVE_IEEE754_H) || __has_include() #include diff --git a/test/utils.h b/test/utils.h++ similarity index 99% rename from test/utils.h rename to test/utils.h++ index dd52dc00..3fbf4513 100644 --- a/test/utils.h +++ b/test/utils.h++ @@ -13,7 +13,7 @@ */ #pragma once -#include "base.h" +#include "base.h++" #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ !defined(__ORDER_BIG_ENDIAN__) From 255890308101d5a4a48f0b64fc670f17dd8382ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 8 Nov 2022 16:19:09 +0300 Subject: [PATCH 199/364] =?UTF-8?q?mdbx-make:=20=D0=B8=D1=81=D0=BF=D1=80?= =?UTF-8?q?=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=81=D0=B1=D0=BE?= =?UTF-8?q?=D1=80=D0=BA=D0=B8=20=D1=82=D0=B5=D1=81=D1=82=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=BF=D1=80=D0=B8=20`MDBX=5FBUILD=5FCXX=3DNO`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GNUmakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GNUmakefile b/GNUmakefile index 68972f99..2ea6bf91 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -353,7 +353,7 @@ TEST_DB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.d TEST_LOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log TEST_OSAL := $(shell $(uname2osal)) TEST_ITER := $(shell $(uname2titer)) -TEST_SRC := test/osal-$(TEST_OSAL).c++ $(filter-out $(wildcard test/osal-*.c++), $(wildcard test/*.c++)) +TEST_SRC := test/osal-$(TEST_OSAL).c++ $(filter-out $(wildcard test/osal-*.c++),$(wildcard test/*.c++)) $(call select_by,MDBX_BUILD_CXX,,src/mdbx.c++) TEST_INC := $(wildcard test/*.h++) TEST_OBJ := $(patsubst %.c++,%.o,$(TEST_SRC)) TAR ?= $(shell which gnu-tar || echo tar) From c2703065800c24cc8ba31081dcf090f9d54c8d1b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 8 Nov 2022 19:44:42 +0300 Subject: [PATCH 200/364] =?UTF-8?q?mdbx-windows:=20=D1=83=D1=82=D0=BE?= =?UTF-8?q?=D1=87=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=80=D0=BE=D0=B2?= =?UTF-8?q?=D0=B5=D1=80=D0=BE=D0=BA=20=D0=BC=D0=B0=D0=BA=D1=80=D0=BE=D1=81?= =?UTF-8?q?=D0=BE=D0=B2=20MinGW=20=D0=B8=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=80=D0=B5=D0=B4=D1=83?= =?UTF-8?q?=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- example/example-mdbx.c | 6 ++++++ src/base.h | 4 ---- src/core.c | 2 ++ src/internals.h | 2 +- src/mdbx.c++ | 2 +- test/osal-windows.c++ | 4 ++-- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/example/example-mdbx.c b/example/example-mdbx.c index 501c430d..a3735f9a 100644 --- a/example/example-mdbx.c +++ b/example/example-mdbx.c @@ -18,7 +18,13 @@ * . */ +#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \ + !defined(__USE_MINGW_ANSI_STDIO) +#define __USE_MINGW_ANSI_STDIO 1 +#endif /* MinGW */ + #include "mdbx.h" + #include #include #include diff --git a/src/base.h b/src/base.h index 187b5270..3533d575 100644 --- a/src/base.h +++ b/src/base.h @@ -236,10 +236,6 @@ __extern_C key_t ftok(const char *, int); #elif _WIN32_WINNT < 0x0500 #error At least 'Windows 2000' API is required for libmdbx. #endif /* _WIN32_WINNT */ -#if (defined(__MINGW32__) || defined(__MINGW64__)) && \ - !defined(__USE_MINGW_ANSI_STDIO) -#define __USE_MINGW_ANSI_STDIO 1 -#endif /* MinGW */ #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif /* WIN32_LEAN_AND_MEAN */ diff --git a/src/core.c b/src/core.c index fb30b30e..40ed3bd1 100644 --- a/src/core.c +++ b/src/core.c @@ -24419,6 +24419,8 @@ __dll_export "MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." 
MDBX_STRINGIFY(__MINGW64_MINOR_VERSION) #elif defined(__MINGW32__) "MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION) + #elif defined(__MINGW__) + "MINGW " MDBX_STRINGIFY(__MINGW_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW_MINOR_VERSION) #elif defined(__IBMC__) "IBM C " MDBX_STRINGIFY(__IBMC__) #elif defined(__GNUC__) diff --git a/src/internals.h b/src/internals.h index 55b17b3b..d29b1068 100644 --- a/src/internals.h +++ b/src/internals.h @@ -126,7 +126,7 @@ #if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \ !defined(__USE_MINGW_ANSI_STDIO) #define __USE_MINGW_ANSI_STDIO 1 -#endif /* __USE_MINGW_ANSI_STDIO */ +#endif /* MinGW */ #if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE) #define UNICODE diff --git a/src/mdbx.c++ b/src/mdbx.c++ index 78a4ead0..17716c4b 100644 --- a/src/mdbx.c++ +++ b/src/mdbx.c++ @@ -12,7 +12,7 @@ #if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \ !defined(__USE_MINGW_ANSI_STDIO) #define __USE_MINGW_ANSI_STDIO 1 -#endif /* __USE_MINGW_ANSI_STDIO */ +#endif /* MinGW */ #include "../mdbx.h++" diff --git a/test/osal-windows.c++ b/test/osal-windows.c++ index 711a9b12..f656d70a 100644 --- a/test/osal-windows.c++ +++ b/test/osal-windows.c++ @@ -363,11 +363,11 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { case /* STATUS_ASSERTION_FAILURE */ 0xC0000420L: case /* STATUS_HEAP_CORRUPTION */ 0xC0000374L: case /* STATUS_CONTROL_STACK_VIOLATION */ 0xC00001B2L: - log_error("pid %u, exception 0x%x", pid, ExitCode); + log_error("pid %zu, exception 0x%x", (intptr_t)pid, (unsigned)ExitCode); status = as_coredump; break; default: - log_error("pid %u, exit code %u", pid, ExitCode); + log_error("pid %zu, exit code %u", (intptr_t)pid, (unsigned)ExitCode); status = as_failed; break; } From 144cbbabb8096ef1e11ce3c32ca7d4f194ba3fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 8 Nov 2022 20:12:05 +0300 Subject: [PATCH 201/364] =?UTF-8?q?mdbx-test:=20=D0=BF=D0=BE=D0=B4=D0=B4?= =?UTF-8?q?=D0=B5=D1=80=D0=B6=D0=BA=D0=B0=20MinGW=20=D0=B2=20=D1=81=D0=BA?= =?UTF-8?q?=D1=80=D0=B8=D0=BF=D1=82=D0=B0=D1=85=20=D1=82=D0=B5=D1=81=D1=82?= =?UTF-8?q?=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D1=8F=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20CI.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/long_stochastic.sh | 10 ++++- test/stochastic_small.sh | 95 +++++++++++++++++++++++++++++++++++----- 2 files changed, 94 insertions(+), 11 deletions(-) diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index d75607ec..9ee13e1c 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -LIST=basic +LIST=--hill FROM=1 UPTO=9999999 MONITOR= @@ -222,6 +222,14 @@ case ${UNAME} in ;; MSYS*|MINGW*) + if [ -z "${TESTDB_DIR:-}" ]; then + for old_test_dir in $(ls -d /tmp/mdbx-test.[0-9]* 2>/dev/null); do + rm -rf $old_test_dir + done + TESTDB_DIR="/tmp/mdbx-test.$$" + fi + mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* + echo "FIXME: Fake support for ${UNAME}" ram_avail_mb=32768 ;; diff --git a/test/stochastic_small.sh b/test/stochastic_small.sh index b0aa1cb7..1ae45ea7 100755 --- a/test/stochastic_small.sh +++ b/test/stochastic_small.sh @@ -1,18 +1,17 @@ #!/usr/bin/env bash -if ! 
which make cc c++ tee >/dev/null; then - echo "Please install the following prerequisites: make cc c++ tee banner" >&2 - exit 1 -fi -LIST=--hill +LIST=basic FROM=1 -UPTO=9999 +UPTO=9999999 MONITOR= LOOPS= SKIP_MAKE=no +GEOMETRY_JITTER=yes BANNER="$(which banner 2>/dev/null | echo echo)" UNAME="$(uname -s 2>/dev/null || echo Unknown)" DB_UPTO_MB=17408 +PAGESIZE=min + while [ -n "$1" ] do @@ -20,6 +19,10 @@ do --help) echo "--multi Engage multi-process test scenario (default)" echo "--single Execute series of single-process tests (for QEMU, etc)" + echo "--nested Execute only 'nested' testcase" + echo "--hill Execute only 'hill' testcase" + echo "--append Execute only 'append' testcase" + echo "--ttl Execute only 'ttl' testcase" echo "--with-valgrind Run tests under Valgrind's memcheck tool" echo "--skip-make Don't (re)build libmdbx and test's executable" echo "--from NN Start iterating from the NN ops per test case" @@ -27,6 +30,8 @@ do echo "--loops NN Stop after the NN loops" echo "--dir PATH Specifies directory for test DB and other files (it will be cleared)" echo "--db-upto-mb NN Limits upper size of test DB to the NN megabytes" + echo "--no-geometry-jitter Disable jitter for geometry upper-size" + echo "--pagesize NN Use specified page size (256 is minimal and used by default) " echo "--help Print this usage help and exit" exit -2 ;; @@ -36,6 +41,18 @@ do --single) LIST="--nested --hill --append --ttl --copy" ;; + --nested) + LIST="--nested" + ;; + --hill) + LIST="--hill" + ;; + --append) + LIST="--append" + ;; + --ttl) + LIST="--ttl" + ;; --with-valgrind) echo " NOTE: Valgrind could produce some false-positive warnings" echo " in multi-process environment with shared memory." 
@@ -88,6 +105,42 @@ do fi shift ;; + --no-geometry-jitter) + GEOMETRY_JITTER=no + ;; + --pagesize|--page-size) + case "$2" in + min|max|256|512|1024|2048|4096|8192|16386|32768|65536) + PAGESIZE=$2 + ;; + 1|1k|1K|k|K) + PAGESIZE=$((1024*1)) + ;; + 2|2k|2K) + PAGESIZE=$((1024*2)) + ;; + 4|4k|4K) + PAGESIZE=$((1024*4)) + ;; + 8|8k|8K) + PAGESIZE=$((1024*8)) + ;; + 16|16k|16K) + PAGESIZE=$((1024*16)) + ;; + 32|32k|32K) + PAGESIZE=$((1024*32)) + ;; + 64|64k|64K) + PAGESIZE=$((1024*64)) + ;; + *) + echo "Invalig page size '$2'" + exit -2 + ;; + esac + shift + ;; *) echo "Unknown option '$1'" exit -2 @@ -107,6 +160,11 @@ if [ -z "$MONITOR" ]; then export MALLOC_CHECK_=7 MALLOC_PERTURB_=42 fi +if ! which $([ "$SKIP_MAKE" == "no" ] && echo make cc c++) tee >/dev/null; then + echo "Please install the following prerequisites: make cc c++ tee banner" >&2 + exit 1 +fi + ############################################################################### # 1. clean data from prev runs and examine available RAM @@ -123,9 +181,9 @@ case ${UNAME} in mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* if LC_ALL=C free | grep -q -i available; then - ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 7) / 1024)) + ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s '[:blank:]' ' ' | cut -d ' ' -f 7) / 1024)) else - ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 4) / 1024)) + ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s '[:blank:]' ' ' | cut -d ' ' -f 4) / 1024)) fi ;; @@ -164,6 +222,19 @@ case ${UNAME} in echo "pagesize ${pagesize}K, freepages ${freepages}, ram_avail_mb ${ram_avail_mb}" ;; + MSYS*|MINGW*) + if [ -z "${TESTDB_DIR:-}" ]; then + for old_test_dir in $(ls -d /tmp/mdbx-test.[0-9]* 2>/dev/null); do + rm -rf $old_test_dir + done + TESTDB_DIR="/tmp/mdbx-test.$$" + fi + mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* + + echo "FIXME: Fake support for ${UNAME}" + ram_avail_mb=32768 + ;; + *) echo "FIXME: 
${UNAME} not supported by this script" exit 2 @@ -236,6 +307,10 @@ case ${UNAME} in fi ;; + MSYS*|MINGW*) + echo "FIXME: Fake support for ${UNAME}" + ;; + *) echo "FIXME: ${UNAME} not supported by this script" exit 2 @@ -300,8 +375,8 @@ function probe { rm -f ${TESTDB_DIR}/* || failed for case in $LIST do - echo "Run ./mdbx_test ${speculum} --random-writemap=no --ignore-dbfull --repeat=1 --pathname=${TESTDB_DIR}/long.db --cleanup-after=no $@ $case" - ${MONITOR} ./mdbx_test ${speculum} --random-writemap=no --ignore-dbfull --repeat=1 --pathname=${TESTDB_DIR}/long.db --cleanup-before=yes --cleanup-after=no "$@" $case | check_deep \ + echo "Run ./mdbx_test ${speculum} --random-writemap=no --ignore-dbfull --repeat=11 --pathname=${TESTDB_DIR}/long.db --cleanup-after=no --geometry-jitter=${GEOMETRY_JITTER} $@ $case" + ${MONITOR} ./mdbx_test ${speculum} --random-writemap=no --ignore-dbfull --repeat=11 --pathname=${TESTDB_DIR}/long.db --cleanup-after=no --geometry-jitter=${GEOMETRY_JITTER} "$@" $case | check_deep \ && ${MONITOR} ./mdbx_chk ${TESTDB_DIR}/long.db | tee ${TESTDB_DIR}/long-chk.log \ && ([ ! 
-e ${TESTDB_DIR}/long.db-copy ] || ${MONITOR} ./mdbx_chk ${TESTDB_DIR}/long.db-copy | tee ${TESTDB_DIR}/long-chk-copy.log) \ || failed From 2dfdac28211ccd1b1250072792f2879aef6c0df5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 8 Nov 2022 23:32:34 +0300 Subject: [PATCH 202/364] =?UTF-8?q?mdbx-windows:=20=D0=BF=D0=BE=D0=B2?= =?UTF-8?q?=D1=82=D0=BE=D1=80=20=D1=87=D1=82=D0=B5=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=B7=D0=B0=D0=B3=D0=BE=D0=BB=D0=BE=D0=B2=D0=BA=D0=B0=20=D0=BF?= =?UTF-8?q?=D1=80=D0=B8=20`ERROR=5FLOCK=5FVIOLATION`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/src/core.c b/src/core.c index 40ed3bd1..494da680 100644 --- a/src/core.c +++ b/src/core.c @@ -11425,19 +11425,42 @@ __cold static int read_header(MDBX_env *env, MDBX_meta *dest, TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, offset, MIN_PAGESIZE, retryleft); int err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && + env->me_dxb_mmap.filesize == 0 && + mode_bits /* non-zero for DB creation */ != 0) { + NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err)); + return err; + } +#if defined(_WIN32) || defined(_WIN64) + if (err == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + if (err == ERROR_LOCK_VIOLATION && --retryleft) { + WARNING("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); + continue; + } + } +#endif /* Windows */ if (err != MDBX_SUCCESS) { - if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && - env->me_dxb_mmap.filesize == 0 && - mode_bits /* non-zero for DB creation */ != 0) - NOTICE("read meta: empty file 
(%d, %s)", err, mdbx_strerror(err)); - else - ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); + ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } char again[MIN_PAGESIZE]; err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); +#if defined(_WIN32) || defined(_WIN64) + if (err == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); + if (err == ERROR_LOCK_VIOLATION && --retryleft) { + WARNING("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); + continue; + } + } +#endif /* Windows */ if (err != MDBX_SUCCESS) { ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, mdbx_strerror(err)); From 1b0d747e7b42f715bfd9469b82e63f878a77bcca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 9 Nov 2022 00:36:06 +0300 Subject: [PATCH 203/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index 4e4ca18a..a880461e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -8,6 +8,8 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Новое: + - Требования к CMake понижены до версии 3.0.2 для возможности сборки для устаревших платформ. + - Добавлена возможность профилирования работы GC в сложных и/или нагруженных сценариях (например Ethereum/Erigon). По-умолчанию соответствующий код отключен, а для его активации необходимо указать опцию сборки `MDBX_ENABLE_PROFGC=1`. 
@@ -105,6 +107,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Исправления (без корректировок вышеперечисленных новых функций): + - Устранения ряда предупреждений при сборке посредством MinGW. - Устранение ложно-положительных сообщений от Valgrind об использовании не инициализированных данных из-за выравнивающих зазоров в `struct troika`. - Исправлен возврат неожиданной ошибки `MDBX_BUSY` из функций `mdbx_env_set_option()`, From 67f4098bfa6fb52bfa5b6b708928c7ea2e35b2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 9 Nov 2022 12:39:06 +0300 Subject: [PATCH 204/364] mdbx: add admonition of insecure for RISC-V. --- mdbx.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mdbx.h b/mdbx.h index b1cfc0f0..98e8b494 100644 --- a/mdbx.h +++ b/mdbx.h @@ -75,6 +75,14 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #ifndef LIBMDBX_H #define LIBMDBX_H +#if defined(__riscv) || defined(__riscv__) || defined(__RISCV) || \ + defined(__RISCV__) +#warning The RISC-V architecture is intentionally insecure by design. \ + Please delete this admonition at your own risk, \ + if you make such decision informed and consciously. \ + Refer to https://clck.ru/32d9xH for more information. 
+#endif /* RISC-V */ + #ifdef _MSC_VER #pragma warning(push, 1) #pragma warning(disable : 4548) /* expression before comma has no effect; \ From ec0ec90f157a9c580bfbdf5ae50490694dfbe73b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 9 Nov 2022 16:41:46 +0300 Subject: [PATCH 205/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=80=D1=80=D0=B5?= =?UTF-8?q?=D0=BA=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=BA=D0=B0=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index a880461e..fda8ce33 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,7 +4,7 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.2 (Ярыгин) запланировано на 2022-11-07 +## v0.12.2 (Ярыгин) запланировано на 2022-11-11 Новое: @@ -138,6 +138,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Мелочи: + - Добавлено предупреждение о небезопасности RISC-V. - Добавлено описание параметров `MDBX_debug_func` и `MDBX_debug_func`. - Добавлено обходное решение для минимизации ложно-положительных конфликтов при использовании файловых блокировок в Windows. 
From 13c256026ef0199847080cf3e66db9990c3aa5b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 9 Nov 2022 23:37:50 +0300 Subject: [PATCH 206/364] =?UTF-8?q?mdbx-cmake:=20=D1=81=D0=B8=D0=BD=D1=85?= =?UTF-8?q?=D1=80=D0=BE=D0=BD=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20=D0=BA?= =?UTF-8?q?=D0=BE=D0=BD=D1=81=D1=82=D1=80=D1=83=D0=BA=D1=86=D0=B8=D0=B9=20?= =?UTF-8?q?CMake=20=D0=BC=D0=B5=D0=B6=D0=B4=D1=83=20=D0=BF=D1=80=D0=BE?= =?UTF-8?q?=D0=B5=D0=BA=D1=82=D0=B0=D0=BC=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd5bd1c2..04856291 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,25 +243,21 @@ else() option(BUILD_FOR_NATIVE_CPU "Generate code for the compiling machine CPU" OFF) endif() - if((CMAKE_CONFIGURATION_TYPES OR NOT CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") - AND (NOT CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) - AND (NOT CMAKE_COMPILER_IS_CLANG OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.0)) - set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT ON) - else() - set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT OFF) - endif() - if(CMAKE_INTERPROCEDURAL_OPTIMIZATION_AVAILABLE OR GCC_LTO_AVAILABLE OR MSVC_LTO_AVAILABLE OR CLANG_LTO_AVAILABLE) - option(INTERPROCEDURAL_OPTIMIZATION "Enable interprocedural/LTO optimization" ${INTERPROCEDURAL_OPTIMIZATION_DEFAULT}) - else() - set(INTERPROCEDURAL_OPTIMIZATION OFF) + if((CMAKE_CONFIGURATION_TYPES OR NOT CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") AND + ((MSVC_LTO_AVAILABLE AND NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 19) OR + (GCC_LTO_AVAILABLE AND NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 7) OR + (CLANG_LTO_AVAILABLE AND NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 5))) + 
set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT ON) + else() + set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT OFF) + endif() + option(INTERPROCEDURAL_OPTIMIZATION "Enable interprocedural/LTO optimization." ${INTERPROCEDURAL_OPTIMIZATION_DEFAULT}) endif() if(INTERPROCEDURAL_OPTIMIZATION) - if(GCC_LTO_AVAILABLE AND - # Отключаем LTO для G++ < 7.0, ибо падает: lto1: internal compiler error: in splice_child_die, at dwarf2out.c:5030 - (NOT CMAKE_CXX_COMPILER_LOADED OR NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)) + if(GCC_LTO_AVAILABLE) set(LTO_ENABLED TRUE) set(CMAKE_AR ${CMAKE_GCC_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) set(CMAKE_C_COMPILER_AR ${CMAKE_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) From bc80fbbeeafaf23b9d521c3539aa4254cd434fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 10 Nov 2022 10:49:01 +0300 Subject: [PATCH 207/364] mdbx-test: fix stochastic scripts after prev commit. --- test/long_stochastic.sh | 2 +- test/stochastic_small.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 9ee13e1c..1dc2b8d2 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -LIST=--hill +LIST=basic FROM=1 UPTO=9999999 MONITOR= diff --git a/test/stochastic_small.sh b/test/stochastic_small.sh index 1ae45ea7..4a886bbf 100755 --- a/test/stochastic_small.sh +++ b/test/stochastic_small.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -LIST=basic +LIST=--hill FROM=1 UPTO=9999999 MONITOR= From 90f39c88a0c585086a792a5ac26fc566508a2619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 10 Nov 2022 00:17:48 +0300 Subject: [PATCH 208/364] mdbx-test: add `--dont-check-ram-size` option to stochastic scripts. 
--- test/long_stochastic.sh | 26 ++++++++++++++++++-------- test/stochastic_small.sh | 27 ++++++++++++++++++--------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 1dc2b8d2..906bacee 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -11,6 +11,7 @@ BANNER="$(which banner 2>/dev/null | echo echo)" UNAME="$(uname -s 2>/dev/null || echo Unknown)" DB_UPTO_MB=17408 PAGESIZE=min +DONT_CHECK_RAM=no while [ -n "$1" ] do @@ -31,6 +32,7 @@ do echo "--db-upto-mb NN Limits upper size of test DB to the NN megabytes" echo "--no-geometry-jitter Disable jitter for geometry upper-size" echo "--pagesize NN Use specified page size (256 is minimal and used by default) " + echo "--dont-check-ram-size Don't check available RAM " echo "--help Print this usage help and exit" exit -2 ;; @@ -140,6 +142,9 @@ do esac shift ;; + --dont-check-ram-size) + DONT_CHECK_RAM=yes + ;; *) echo "Unknown option '$1'" exit -2 @@ -246,11 +251,15 @@ rm -f ${TESTDB_DIR}/* # 2. estimate reasonable RAM space for test-db echo "=== ${ram_avail_mb}M RAM available" -ram_reserve4logs_mb=1234 -if [ $ram_avail_mb -lt $ram_reserve4logs_mb ]; then - echo "=== At least ${ram_reserve4logs_mb}Mb RAM required" - exit 3 -fi +if [ $DONT_CHECK_RAM = yes ]; then + db_size_mb=$DB_UPTO_MB + ram_reserve4logs_mb=64 +else + ram_reserve4logs_mb=1234 + if [ $ram_avail_mb -lt $ram_reserve4logs_mb ]; then + echo "=== At least ${ram_reserve4logs_mb}Mb RAM required" + exit 3 + fi # # В режимах отличных от MDBX_WRITEMAP изменения до записи в файл @@ -270,9 +279,10 @@ fi # that malloc() will not return the allocated memory to the # system immediately, as well some space is required for logs. 
# -db_size_mb=$(((ram_avail_mb - ram_reserve4logs_mb) / 4)) -if [ $db_size_mb -gt $DB_UPTO_MB ]; then - db_size_mb=$DB_UPTO_MB + db_size_mb=$(((ram_avail_mb - ram_reserve4logs_mb) / 4)) + if [ $db_size_mb -gt $DB_UPTO_MB ]; then + db_size_mb=$DB_UPTO_MB + fi fi echo "=== use ${db_size_mb}M for DB" diff --git a/test/stochastic_small.sh b/test/stochastic_small.sh index 4a886bbf..50497f85 100755 --- a/test/stochastic_small.sh +++ b/test/stochastic_small.sh @@ -11,7 +11,7 @@ BANNER="$(which banner 2>/dev/null | echo echo)" UNAME="$(uname -s 2>/dev/null || echo Unknown)" DB_UPTO_MB=17408 PAGESIZE=min - +DONT_CHECK_RAM=no while [ -n "$1" ] do @@ -32,6 +32,7 @@ do echo "--db-upto-mb NN Limits upper size of test DB to the NN megabytes" echo "--no-geometry-jitter Disable jitter for geometry upper-size" echo "--pagesize NN Use specified page size (256 is minimal and used by default) " + echo "--dont-check-ram-size Don't check available RAM " echo "--help Print this usage help and exit" exit -2 ;; @@ -141,6 +142,9 @@ do esac shift ;; + --dont-check-ram-size) + DONT_CHECK_RAM=yes + ;; *) echo "Unknown option '$1'" exit -2 @@ -247,11 +251,15 @@ rm -f ${TESTDB_DIR}/* # 2. estimate reasonable RAM space for test-db echo "=== ${ram_avail_mb}M RAM available" -ram_reserve4logs_mb=1234 -if [ $ram_avail_mb -lt $ram_reserve4logs_mb ]; then - echo "=== At least ${ram_reserve4logs_mb}Mb RAM required" - exit 3 -fi +if [ $DONT_CHECK_RAM = yes ]; then + db_size_mb=$DB_UPTO_MB + ram_reserve4logs_mb=64 +else + ram_reserve4logs_mb=1234 + if [ $ram_avail_mb -lt $ram_reserve4logs_mb ]; then + echo "=== At least ${ram_reserve4logs_mb}Mb RAM required" + exit 3 + fi # # В режимах отличных от MDBX_WRITEMAP изменения до записи в файл @@ -271,9 +279,10 @@ fi # that malloc() will not return the allocated memory to the # system immediately, as well some space is required for logs. 
# -db_size_mb=$(((ram_avail_mb - ram_reserve4logs_mb) / 4)) -if [ $db_size_mb -gt $DB_UPTO_MB ]; then - db_size_mb=$DB_UPTO_MB + db_size_mb=$(((ram_avail_mb - ram_reserve4logs_mb) / 4)) + if [ $db_size_mb -gt $DB_UPTO_MB ]; then + db_size_mb=$DB_UPTO_MB + fi fi echo "=== use ${db_size_mb}M for DB" From 53d78bbad5285b3676ba2d196756eea539429be2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 10 Nov 2022 00:20:38 +0300 Subject: [PATCH 209/364] mdbx-make: use `--dont-check-ram-size` for small-tests targets. --- GNUmakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 2ea6bf91..85147f74 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -418,7 +418,7 @@ smoke-fault: build-test test: build-test @echo ' RUNNING `test/long_stochastic.sh --loops 2`...' - $(QUIET)test/long_stochastic.sh --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) + $(QUIET)test/long_stochastic.sh --dont-check-ram-size --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) long-test: build-test @echo ' RUNNING `test/long_stochastic.sh --loops 42`...' @@ -426,7 +426,7 @@ long-test: build-test test-singleprocess: build-test @echo ' RUNNING `test/long_stochastic.sh --single --loops 2`...' 
- $(QUIET)test/long_stochastic.sh --single --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) + $(QUIET)test/long_stochastic.sh --dont-check-ram-size --single --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) test-valgrind: CFLAGS_EXTRA=-Ofast -DMDBX_USE_VALGRIND test-valgrind: build-test From 8ffb0bb3d881a9da3f9c5b731e7cf38532cb8ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 10 Nov 2022 15:09:42 +0300 Subject: [PATCH 210/364] =?UTF-8?q?mdbx-cmake:=20=D0=BF=D0=BE=D0=B4=D0=B4?= =?UTF-8?q?=D0=B5=D1=80=D0=B6=D0=BA=D0=B0=20=D0=B2=D1=81=D0=B5=D1=85=20?= =?UTF-8?q?=D0=BE=D1=81=D0=BD=D0=BE=D0=B2=D0=BD=D1=8B=D1=85=20=D0=BE=D0=BF?= =?UTF-8?q?=D1=86=D0=B8=D0=B9=20=D0=BF=D1=80=D0=B8=20=D1=81=D0=B1=D0=BE?= =?UTF-8?q?=D1=80=D0=BA=D0=B5=20=D0=BF=D0=BE=D1=81=D1=80=D0=B5=D0=B4=D1=81?= =?UTF-8?q?=D1=82=D0=B2=D0=BE=D0=BC=20CMake.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 10 ++++++++++ src/config.h.in | 5 +++++ src/options.h | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04856291..a5167d62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -519,6 +519,16 @@ add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Cloc mark_as_advanced(MDBX_TRUST_RTC) option(MDBX_FORCE_ASSERTIONS "Force enable assertion checking" OFF) option(MDBX_DISABLE_VALIDATION "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF) +option(MDBX_ENABLE_REFUND "Zerocost auto-compactification during write-transactions" ON) +option(MDBX_ENABLE_MADVISE "Using POSIX' madvise() and/or similar hints" ON) +if (CMAKE_TARGET_BITNESS GREATER 32) + set(MDBX_BIGFOOT_DEFAULT ON) +else() + set(MDBX_BIGFOOT_DEFAULT OFF) +endif() 
+option(MDBX_ENABLE_BIGFOOT "Chunking long list of retired pages during huge transactions commit to avoid use sequences of pages" ${MDBX_BIGFOOT_DEFAULT}) +option(MDBX_ENABLE_PGOP_STAT "Gathering statistics for page operations" ON) +option(MDBX_ENABLE_PROFGC "Profiling of GC search and updates" OFF) if(NOT MDBX_AMALGAMATED_SOURCE) if(CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") diff --git a/src/config.h.in b/src/config.h.in index 786a8c0b..05c561b1 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -28,6 +28,11 @@ #endif #cmakedefine01 MDBX_DISABLE_VALIDATION #cmakedefine01 MDBX_AVOID_MSYNC +#cmakedefine01 MDBX_ENABLE_REFUND +#cmakedefine01 MDBX_ENABLE_MADVISE +#cmakedefine01 MDBX_ENABLE_BIGFOOT +#cmakedefine01 MDBX_ENABLE_PGOP_STAT +#cmakedefine01 MDBX_ENABLE_PROFGC /* Windows */ #cmakedefine01 MDBX_WITHOUT_MSVC_CRT diff --git a/src/options.h b/src/options.h index 11df6967..2ab0dce6 100644 --- a/src/options.h +++ b/src/options.h @@ -99,7 +99,7 @@ #error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 #endif /* MDBX_ENABLE_BIGFOOT */ -/** Controls use of POSIX madvise() hints and friends. */ +/** Controls using of POSIX' madvise() and/or similar hints. 
*/ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 #elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1) From 70e80067760ab24dff5fbc7d8e5f49194378262e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 10 Nov 2022 15:54:31 +0300 Subject: [PATCH 211/364] =?UTF-8?q?mdbx-docs:=20=D0=BF=D0=B5=D1=80=D0=B5?= =?UTF-8?q?=D0=BD=D0=B0=D0=BF=D1=80=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D0=B5=20github-=D1=81=D1=81=D1=8B=D0=BB=D0=BE=D0=BA=20=D0=BD?= =?UTF-8?q?=D0=B0=20web-archive.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 130 ++++++++++++++++++------------------- README.md | 6 +- TODO.md | 22 +++---- mdbx.h++ | 2 +- src/core.c | 20 +++--- src/mdbx_chk.c | 2 +- src/osal.c | 12 ++-- test/valgrind_suppress.txt | 2 +- 8 files changed, 98 insertions(+), 98 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index fda8ce33..d162ba94 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -353,7 +353,7 @@ New: - Support build by MinGW' make from command line without CMake. - Added `mdbx::filesystem` C++ API namespace that corresponds to `std::filesystem` or `std::experimental::filesystem`. - Created [website](https://libmdbx.dqdkfa.ru/) for online auto-generated documentation. - - Used `https://web.archive.org/web/20220414235959/https://github.com/erthink/` for dead (or temporarily lost) resources deleted by ~~Github~~. + - Used `https://web.archive.org/web/https://github.com/erthink/libmdbx` for dead (or temporarily lost) resources deleted by ~~Github~~. - Added `--loglevel=` command-line option to the `mdbx_test` tool. - Added few fast smoke-like tests into CMake builds. @@ -393,7 +393,7 @@ Minors: The stable release with the complete workaround for an incoherence flaw of Linux unified page/buffer cache. Nonetheless the cause for this trouble may be an issue of Intel CPU cache/MESI. 
-See [issue#269](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for more information. +See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information. Acknowledgements: @@ -402,8 +402,8 @@ Acknowledgements: Fixes: - - [Added complete workaround](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for an incoherence flaw of Linux unified page/buffer cache. - - [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/272) cursor reusing for read-only transactions. + - [Added complete workaround](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for an incoherence flaw of Linux unified page/buffer cache. + - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/272) cursor reusing for read-only transactions. - Fixed copy&paste typo inside `mdbx::cursor::find_multivalue()`. Minors: @@ -418,7 +418,7 @@ Minors: ## v0.11.5 at 2022-02-23 The release with the temporary hotfix for a flaw of Linux unified page/buffer cache. -See [issue#269](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for more information. +See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information. Acknowledgements: @@ -428,10 +428,10 @@ Acknowledgements: Fixes: - - [Added hotfix](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/269) for a flaw of Linux unified page/buffer cache. - - [Fixed/Reworked](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/270) move-assignment operators for "managed" classes of C++ API. + - [Added hotfix](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for a flaw of Linux unified page/buffer cache. 
+ - [Fixed/Reworked](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/270) move-assignment operators for "managed" classes of C++ API. - Fixed potential `SIGSEGV` while open DB with overrided non-default page size. - - [Made](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/267) `mdbx_env_open()` idempotence in failure cases. + - [Made](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/267) `mdbx_env_open()` idempotence in failure cases. - Refined/Fixed pages reservation inside `mdbx_update_gc()` to avoid non-reclamation in a rare cases. - Fixed typo in a retained space calculation for the hsr-callback. @@ -464,15 +464,15 @@ New features, extensions and improvements: Fixes: - Fixed handling `MDBX_opt_rp_augment_limit` for GC's records from huge transactions (Erigon/Akula/Ethereum). - - [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/258) build on Android (avoid including `sys/sem.h`). - - [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/261) missing copy assignment operator for `mdbx::move_result`. + - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/258) build on Android (avoid including `sys/sem.h`). + - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/261) missing copy assignment operator for `mdbx::move_result`. - Fixed missing `&` for `std::ostream &operator<<()` overloads. - Fixed unexpected `EXDEV` (Cross-device link) error from `mdbx_env_copy()`. - Fixed base64 encoding/decoding bugs in auxillary C++ API. - Fixed overflow of `pgno_t` during checking PNL on 64-bit platforms. - - [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/260) excessive PNL checking after sort for spilling. 
+ - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/260) excessive PNL checking after sort for spilling. - Reworked checking `MAX_PAGENO` and DB upper-size geometry limit. - - [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/265) build for some combinations of versions of MSVC and Windows SDK. + - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/265) build for some combinations of versions of MSVC and Windows SDK. Minors: @@ -499,10 +499,10 @@ Acknowledgements: New features, extensions and improvements: - - [Added](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/236) `mdbx_cursor_get_batch()`. - - [Added](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/250) `MDBX_SET_UPPERBOUND`. + - [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/236) `mdbx_cursor_get_batch()`. + - [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/250) `MDBX_SET_UPPERBOUND`. - C++ API is finalized now. - - The GC update stage has been [significantly speeded](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem). + - The GC update stage has been [significantly speeded](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem). Fixes: @@ -513,12 +513,12 @@ Minors: - Fixed returning `MDBX_RESULT_TRUE` (unexpected -1) from `mdbx_env_set_option()`. - Added `mdbx_env_get_syncbytes()` and `mdbx_env_get_syncperiod()`. - - [Clarified](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/249) description of `MDBX_INTEGERKEY`. + - [Clarified](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/249) description of `MDBX_INTEGERKEY`. 
- Reworked/simplified `mdbx_env_sync_internal()`. - - [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases. + - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases. - Avoiding extra looping inside `mdbx_env_info_ex()`. - Explicitly enabled core dumps from stochastic tests scripts on Linux. - - [Fixed](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/253) `mdbx_override_meta()` to avoid false-positive assertions. + - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/253) `mdbx_override_meta()` to avoid false-positive assertions. - For compatibility reverted returning `MDBX_ENODATA`for some cases. @@ -534,10 +534,10 @@ Acknowledgements: Fixes: - - [Fixed compilation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/239) with `devtoolset-9` on CentOS/RHEL 7. - - [Fixed unexpected `MDBX_PROBLEM` error](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/242) because of update an obsolete meta-page. - - [Fixed returning `MDBX_NOTFOUND` error](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation. - - [Fixed compilation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/245) without kernel/libc-devel headers. + - [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/239) with `devtoolset-9` on CentOS/RHEL 7. + - [Fixed unexpected `MDBX_PROBLEM` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/242) because of update an obsolete meta-page. 
+ - [Fixed returning `MDBX_NOTFOUND` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation. + - [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/245) without kernel/libc-devel headers. Minors: @@ -554,7 +554,7 @@ Minors: The database format signature has been changed to prevent forward-interoperability with an previous releases, which may lead to a -[false positive diagnosis of database corruption](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/238) +[false positive diagnosis of database corruption](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/238) due to flaws of an old library versions. This change is mostly invisible: @@ -606,7 +606,7 @@ Acknowledgements: Fixes: - Fixed possibility of looping update GC during transaction commit (no public issue since the problem was discovered inside [Positive Technologies](https://www.ptsecurity.ru)). - - Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/235). + - Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/235). - Fixed `noexcept` for potentially throwing `txn::put()` of C++ API. Minors: @@ -632,7 +632,7 @@ Extensions and improvements: Fixes: - - Always setup `madvise` while opening DB (fixes https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/231). + - Always setup `madvise` while opening DB (fixes https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/231). - Fixed checking legacy `P_DIRTY` flag (`0x10`) for nested/sub-pages. 
Minors: @@ -653,11 +653,11 @@ Acknowledgements: - [Lionel Debroux](https://github.com/debrouxl) for fuzzing tests and reporting bugs. - [Sergey Fedotov](https://github.com/SergeyFromHell/) for [`node-mdbx` NodeJS bindings](https://www.npmjs.com/package/node-mdbx). - [Kris Zyp](https://github.com/kriszyp) for [`lmdbx-store` NodeJS bindings](https://github.com/kriszyp/lmdbx-store). - - [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/commits/python-bindings). + - [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings). New features, extensions and improvements: - - [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/201). + - [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/201). - Added options support for `long-stochastic` script. - Avoided `MDBX_TXN_FULL` error for large transactions when possible. - The `MDBX_READERS_LIMIT` increased to `32767`. @@ -665,7 +665,7 @@ New features, extensions and improvements: - Minimized the size of poisoned/unpoisoned regions to avoid Valgrind/ASAN stuck. - Added more workarounds for QEMU for testing builds for 32-bit platforms, Alpha and Sparc architectures. - `mdbx_chk` now skips iteration & checking of DB' records if corresponding page-tree is corrupted (to avoid `SIGSEGV`, ASAN failures, etc). - - Added more checks for [rare/fuzzing corruption cases](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/217). + - Added more checks for [rare/fuzzing corruption cases](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217). 
Backward compatibility break: @@ -677,18 +677,18 @@ Backward compatibility break: Fixes: - Fixed excess meta-pages checks in case `mdbx_chk` is called to check the DB for a specific meta page and thus could prevent switching to the selected meta page, even if the check passed without errors. - - Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/203). - - Fixed [log a warning during a new DB creation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/205). - - Fixed [false-negative `mdbx_cursor_eof()` result](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/207). - - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/208). - - Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/209). - - Fixed [C++ Buffer issue with `std::string` and alignment](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/191). + - Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/203). + - Fixed [log a warning during a new DB creation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/205). + - Fixed [false-negative `mdbx_cursor_eof()` result](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/207). + - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/208). + - Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/209). 
+ - Fixed [C++ Buffer issue with `std::string` and alignment](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/191). - Fixed `safe64_reset()` for platforms without atomic 64-bit compare-and-swap. - Fixed hang/shutdown on big-endian platforms without `__cxa_thread_atexit()`. - - Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/217). + - Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217). - Fixed extra `noexcept` for `buffer::&assign_reference()`. - Fixed `bootid` generation on Windows for case of change system' time. - - Fixed [test framework keygen-related issue](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/127). + - Fixed [test framework keygen-related issue](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/127). ## v0.10.1 at 2021-06-01 @@ -709,10 +709,10 @@ New features: Fixes: - Fixed minor "foo not used" warnings from modern C++ compilers when building the C++ part of the library. - - Fixed confusing/messy errors when build library from unfit github's archives (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/197). + - Fixed confusing/messy errors when build library from unfit github's archives (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/197). - Fixed `#​e​l​s​i​f` typo. - - Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/195). - - Re-Fixed WSL1/WSL2 detection with distinguishing (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/97). 
+ - Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/195). + - Re-Fixed WSL1/WSL2 detection with distinguishing (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/97). ## v0.10.0 at 2021-05-09 @@ -735,7 +735,7 @@ New features: and conjointly with the `MDBX_ENV_CHECKPID=0` and `MDBX_TXN_CHECKOWNER=0` options can yield up to 30% more performance compared to LMDB. - Using float point (exponential quantized) representation for internal 16-bit values - of grow step and shrink threshold when huge ones (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/166). + of grow step and shrink threshold when huge ones (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/166). To minimize the impact on compatibility, only the odd values inside the upper half of the range (i.e. 32769..65533) are used for the new representation. - Added the `mdbx_drop` similar to LMDB command-line tool to purge or delete (sub)database(s). @@ -744,7 +744,7 @@ New features: - The internal node sizes were refined, resulting in a reduction in large/overflow pages in some use cases and a slight increase in limits for a keys size to ≈½ of page size. - Added to `mdbx_chk` output number of keys/items on pages. - - Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/180). + - Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/180). - Major rework page splitting (af9b7b560505684249b76730997f9e00614b8113) for - An "auto-appending" feature upon insertion for both ascending and descending key sequences. 
As a result, the optimality of page filling @@ -752,7 +752,7 @@ New features: inserting ordered sequences of keys, - A "splitting at middle" to make page tree more balanced on average. - Added `mdbx_get_sysraminfo()` to the API. - - Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/183). + - Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183). - Major rework internal labeling of a dirty pages (958fd5b9479f52f2124ab7e83c6b18b04b0e7dda) for a "transparent spilling" feature with the gist to make a dirty pages be ready to spilling (writing to a disk) without further altering ones. @@ -768,7 +768,7 @@ New features: - Support `make help` to list available make targets. - Silently `make`'s build by default. - Preliminary [Python bindings](https://github.com/Thermi/libmdbx/tree/python-bindings) is available now - by [Noel Kuntze](https://github.com/Thermi) (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/147). + by [Noel Kuntze](https://github.com/Thermi) (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/147). Backward compatibility break: @@ -783,22 +783,22 @@ Backward compatibility break: Fixes: - - Fixed performance regression due non-optimal C11 atomics usage (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/160). - - Fixed "reincarnation" of subDB after it deletion (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/168). + - Fixed performance regression due non-optimal C11 atomics usage (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/160). + - Fixed "reincarnation" of subDB after it deletion (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/168). 
- Fixed (disallowing) implicit subDB deletion via operations on `@MAIN`'s DBI-handle. - - Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/171). - - Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/170). - - Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/179). - - Fixed an unreasonably huge default upper limit for DB geometry (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/183). + - Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/171). + - Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/170). + - Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/179). + - Fixed an unreasonably huge default upper limit for DB geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183). - Fixed `constexpr` specifier for the `slice::invalid()`. - - Fixed (no)readahead auto-handling (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/164). + - Fixed (no)readahead auto-handling (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/164). - Fixed non-alloy build for Windows. - Switched to using Heap-functions instead of LocalAlloc/LocalFree on Windows. 
- - Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/190). + - Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/190). - Fixed building by GCC 4.8.5 (added workaround for a preprocessor's bug). - Fixed building C++ part for iOS <= 13.0 (unavailability of `std::filesystem::path`). - Fixed building for Windows target versions prior to Windows Vista (`WIN32_WINNT < 0x0600`). - - Fixed building by MinGW for Windows (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/155). + - Fixed building by MinGW for Windows (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/155). ------------------------------------------------------------------------------- @@ -821,7 +821,7 @@ Removed options and features: New features: - Package for FreeBSD is available now by Mahlon E. Smith. - - New API functions to get/set various options (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/128): + - New API functions to get/set various options (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/128): - the maximum number of named databases for the environment; - the maximum number of threads/reader slots; - threshold (since the last unsteady commit) to force flush the data buffers to disk; @@ -834,7 +834,7 @@ New features: - maximal part of the dirty pages may be spilled when necessary; - minimal part of the dirty pages should be spilled when necessary; - how much of the parent transaction dirty pages will be spilled while start each child transaction; - - Unlimited/Dynamic size of retired and dirty page lists (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/123). 
+ - Unlimited/Dynamic size of retired and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123). - Added `-p` option (purge subDB before loading) to `mdbx_load` tool. - Reworked spilling of large transaction and committing of nested transactions: - page spilling code reworked to avoid the flaws and bugs inherited from LMDB; @@ -844,22 +844,22 @@ New features: - Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options. - Added `mdbx_default_pagesize()` function. - Better support architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc) by means [C11 atomics](https://en.cppreference.com/w/c/atomic). - - Speed up page number lists and dirty page lists (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/132). + - Speed up page number lists and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/132). - Added `LIBMDBX_NO_EXPORTS_LEGACY_API` build option. Fixes: - - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/pull/143). + - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/143). - Fixed `mdbx_realloc()` for case of nullptr and `MDBX_WITHOUT_MSVC_CRT=ON` for Windows. - - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/146). - - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/153). 
+ - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/146). + - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153). - Fixed minor/potential memory leak during page flushing and unspilling. - Fixed handling states of cursors's and subDBs's for nested transactions. - Fixed page leak in extra rare case the list of retired pages changed during update GC on transaction commit. - - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/153). - - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/123). + - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153). + - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123). - Fixed auto-recovery (`weak->steady` with the same boot-id) when Database size at last weak checkpoint is large than at last steady checkpoint. - - Fixed operation on systems with unusual small/large page size, including PowerPC (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/157). + - Fixed operation on systems with unusual small/large page size, including PowerPC (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/157). ## v0.9.2 at 2020-11-27 @@ -897,11 +897,11 @@ Fixes: - Fixed copy&paste typos. - Fixed minor false-positive GCC warning. 
- Added workaround for broken `DEFINE_ENUM_FLAG_OPERATORS` from Windows SDK. - - Fixed cursor state after multimap/dupsort repeated deletes (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/121). + - Fixed cursor state after multimap/dupsort repeated deletes (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/121). - Added `SIGPIPE` suppression for internal thread during `mdbx_env_copy()`. - - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/131). - - Fixed spilled pages checking (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/126). - - Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/136). + - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/131). + - Fixed spilled pages checking (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/126). + - Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/136). - Fixed save/restore/commit of cursors for nested transactions. - Fixed cursors state in rare/special cases (move next beyond end-of-data, after deletion and so on). - Added workaround for MSVC 19.28 (Visual Studio 16.8) (but may still hang during compilation). diff --git a/README.md b/README.md index 3f78a26b..d25189b8 100644 --- a/README.md +++ b/README.md @@ -353,7 +353,7 @@ named mutexes are used. Historically, _libmdbx_ is a deeply revised and extended descendant of the [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). 
At first the development was carried out within the -[ReOpenLDAP](https://web.archive.org/web/20220414235959/https://github.com/erthink/ReOpenLDAP) project. About a +[ReOpenLDAP](https://web.archive.org/web/https://github.com/erthink/ReOpenLDAP) project. About a year later _libmdbx_ was separated into a standalone project, which was [presented at Highload++ 2015 conference](http://www.highload.ru/2015/abstracts/1831.html). @@ -435,7 +435,7 @@ unexpected or broken down. ### Testing The amalgamated source code does not contain any tests for or several reasons. -Please read [the explanation](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/214#issuecomment-870717981) and don't ask to alter this. +Please read [the explanation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/214#issuecomment-870717981) and don't ask to alter this. So for testing _libmdbx_ itself you need a full source code, i.e. the clone of a git repository, there is no option. The full source code of _libmdbx_ has a [`test` subdirectory](https://gitflic.ru/project/erthink/libmdbx/tree/master/test) with minimalistic test "framework". 
@@ -618,7 +618,7 @@ Bindings | Rust | [libmdbx-rs](https://github.com/vorot93/libmdbx-rs) | [Artem Vorotnikov](https://github.com/vorot93) | | Rust | [mdbx](https://crates.io/crates/mdbx) | [gcxfd](https://github.com/gcxfd) | | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | -| Python (draft) | [python-bindings](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi) +| Python (draft) | [python-bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi) | .NET (obsolete) | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | diff --git a/TODO.md b/TODO.md index 66b0fff9..70016f38 100644 --- a/TODO.md +++ b/TODO.md @@ -11,19 +11,19 @@ For the same reason ~~Github~~ is blacklisted forever. So currently most of the links are broken due to noted malicious ~~Github~~ sabotage. - - [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/204). - - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/210). - - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200). - - [Migration guide from LMDB to MDBX](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/199). - - [Support for RAW devices](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/124). - - [Support MessagePack for Keys & Values](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/115). 
- - [Engage new terminology](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/137). + - [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/204). + - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/210). + - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/200). + - [Migration guide from LMDB to MDBX](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/199). + - [Support for RAW devices](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/124). + - [Support MessagePack for Keys & Values](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/115). + - [Engage new terminology](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/137). - Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc. Done ---- - - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223). - - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224). - - [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/192). - - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/193). + - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/223). + - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/224). 
+ - [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/192). + - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/193). diff --git a/mdbx.h++ b/mdbx.h++ index dddc1880..bc3e1802 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -287,7 +287,7 @@ namespace mdbx { // To enable all kinds of an compiler optimizations we use a byte-like type // that don't presumes aliases for pointers as does the `char` type and its // derivatives/typedefs. -// Please see todo4recovery://erased_by_github/libmdbx/issues/263 +// Please see https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/263 // for reasoning of the use of `char8_t` type and switching to `__restrict__`. using byte = char8_t; #else diff --git a/src/core.c b/src/core.c index 494da680..00f1b46d 100644 --- a/src/core.c +++ b/src/core.c @@ -4443,7 +4443,7 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize); const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset); /* check with timeout as the workaround - * for todo4recovery://erased_by_github/libmdbx/issues/269 */ + * for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ if (unlikely(memcmp(wp, rp, bytes))) { ctx->coherency_timestamp = 0; WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, @@ -6598,7 +6598,7 @@ static __inline bool is_gc_usable(const MDBX_txn *txn) { return false; /* avoid (recursive) search inside empty tree and while tree is - updating, todo4recovery://erased_by_github/libmdbx/issues/31 */ + updating, https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/31 */ if (txn->mt_dbs[FREE_DBI].md_entries == 0) return false; @@ -6797,7 +6797,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, /* Stop reclaiming to 
avoid large/overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. - * todo4recovery://erased_by_github/libmdbx/issues/123 */ + * https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 */ NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " "(chunk) -> %zu", MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, @@ -7972,7 +7972,7 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_SUCCESS; } -/* check against todo4recovery://erased_by_github/libmdbx/issues/269 */ +/* check against https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, const volatile MDBX_db *dbs, const volatile MDBX_meta *meta, bool report) { @@ -8073,7 +8073,7 @@ __cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { } /* check with timeout as the workaround - * for todo4recovery://erased_by_github/libmdbx/issues/269 */ + * for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ __hot static int coherency_check_readed(const MDBX_env *env, const txnid_t txnid, const volatile MDBX_db *dbs, @@ -8310,7 +8310,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { const meta_ptr_t head = meta_recent(env, &txn->tw.troika); uint64_t timestamp = 0; while ( - "workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { + "workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs, head.ptr_v, ×tamp); if (likely(rc == MDBX_SUCCESS)) @@ -11953,7 +11953,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, } uint64_t timestamp = 0; - while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { + while ("workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { rc = 
coherency_check_written(env, pending->unsafe_txnid, target, ×tamp); if (likely(rc == MDBX_SUCCESS)) @@ -12475,7 +12475,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, uint64_t timestamp = 0; while ("workaround for " - "todo4recovery://erased_by_github/libmdbx/issues/269") { + "https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { meta = *head.ptr_c; rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta, ×tamp); @@ -13818,7 +13818,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } else { #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. - * See todo4recovery://erased_by_github/libmdbx/issues/67 */ + * See https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/67 */ if ((flags & MDBX_WRITEMAP) == 0) { if (flags & MDBX_ACCEDE) flags |= MDBX_WRITEMAP; @@ -21169,7 +21169,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); /* is the environment open? - * (todo4recovery://erased_by_github/libmdbx/issues/171) */ + * (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/171) */ if (unlikely(!env->me_map)) { /* environment not yet opened */ #if 1 diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 311695c8..57ea1631 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -1171,7 +1171,7 @@ int main(int argc, char *argv[]) { envflags &= ~MDBX_RDONLY; #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. 
- * See todo4recovery://erased_by_github/libmdbx/issues/67 */ + * See https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/67 */ envflags |= MDBX_WRITEMAP; #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ break; diff --git a/src/osal.c b/src/osal.c index 494290a2..71046f6c 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1311,7 +1311,7 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, flags |= O_CLOEXEC; #endif /* O_CLOEXEC */ - /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */ + /* Safeguard for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; static const char dev_null[] = "/dev/null"; @@ -1349,7 +1349,7 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, errno = EACCES /* restore errno if file exists */; } - /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */ + /* Safeguard for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 if (*fd == STDIN_FILENO) { WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", @@ -2192,7 +2192,7 @@ MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. 
- * See todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203 + * See https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, (map->filesize && map->filesize < map->limit) @@ -2271,7 +2271,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. - * See todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203 + * See https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); @@ -2552,7 +2552,7 @@ retry_mapview:; /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See - * todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203 + * https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( map->address, @@ -2574,7 +2574,7 @@ retry_mapview:; /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See - * todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203 + * https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( map->address, (map->current < map->limit) ? 
map->current : map->limit); diff --git a/test/valgrind_suppress.txt b/test/valgrind_suppress.txt index 2a95ff0f..2e67a56b 100644 --- a/test/valgrind_suppress.txt +++ b/test/valgrind_suppress.txt @@ -31,7 +31,7 @@ fun:wipe_steady* } -# memcmp() inside iov_write() as workaround for todo4recovery://erased_by_github/libmdbx/issues/269 +# memcmp() inside iov_write() as workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 { iov-pagecheck-1 Memcheck:Cond From 3704433aa9ccd08be779b17a56172f0583f7d203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 10 Nov 2022 16:34:23 +0300 Subject: [PATCH 212/364] =?UTF-8?q?mdbx:=20=D0=BC=D0=B8=D0=BD=D0=BE=D1=80?= =?UTF-8?q?=D0=BD=D0=BE=D0=B5=20=D1=83=D0=B4=D0=B0=D0=BB=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D0=B5=20=D0=BC=D0=B5=D1=80=D1=82=D0=B2=D0=BE=D0=B3=D0=BE/?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D1=83=D0=B6=D0=BD=D0=BE=D0=B3=D0=BE=20=D0=BA?= =?UTF-8?q?=D0=BE=D0=B4=D0=B0=20=D0=B8=D0=B7=20`page=5Falloc=5Fslowpath()`?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/core.c b/src/core.c index 00f1b46d..cbb94b47 100644 --- a/src/core.c +++ b/src/core.c @@ -6660,9 +6660,6 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 
1 : re_len); - pgno = *range; - if (num == 1) - goto done; range = scan4seq(range, re_len, num - 1); eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); if (likely(range)) { From 9fca1734c7044ace3cdac14ecea14ff6d9715b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 11 Nov 2022 16:16:33 +0300 Subject: [PATCH 213/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index d162ba94..1bdad742 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -8,6 +8,8 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Новое: + - Поддержка всех основных опций при сборке посредством CMake. + - Требования к CMake понижены до версии 3.0.2 для возможности сборки для устаревших платформ. - Добавлена возможность профилирования работы GC в сложных и/или нагруженных @@ -138,6 +140,8 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Мелочи: + - Исторические ссылки cвязанные с удалённым на ~~github~~ проектом перенаправлены на [web.archive.org](https://web.archive.org/web/https://github.com/erthink/libmdbx). + - Синхронизированны конструкции CMake между проектами. - Добавлено предупреждение о небезопасности RISC-V. - Добавлено описание параметров `MDBX_debug_func` и `MDBX_debug_func`. 
- Добавлено обходное решение для минимизации ложно-положительных From b274a35410261d0fb1f8201450f4ad8e4b6090ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 11 Nov 2022 16:44:51 +0300 Subject: [PATCH 214/364] =?UTF-8?q?mdbx-doc:=20=D0=B4=D0=BE=D0=BF=D0=BE?= =?UTF-8?q?=D0=BB=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20man-=D1=81=D1=82=D1=80?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D1=86=20=D1=83=D1=82=D0=B8=D0=BB=D0=B8=D1=82?= =?UTF-8?q?=20=D0=BE=D0=BF=D0=B8=D1=81=D0=B0=D0=BD=D0=B8=D0=B5=D0=BC=20?= =?UTF-8?q?=D0=BE=D0=BF=D1=86=D0=B8=D0=B9=20`-u`=20=D0=B8=20`-U`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/man1/mdbx_chk.1 | 7 +++++++ src/man1/mdbx_copy.1 | 7 +++++++ src/man1/mdbx_dump.1 | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/src/man1/mdbx_chk.1 b/src/man1/mdbx_chk.1 index da2e78fb..269a8246 100644 --- a/src/man1/mdbx_chk.1 +++ b/src/man1/mdbx_chk.1 @@ -81,6 +81,13 @@ Turn to a specified meta-page on successful check. .BR \-T Turn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK! .TP +.BR \-u +Warms up the DB before checking via notifying OS kernel of subsequent access to the database pages. +.TP +.BR \-U +Warms up the DB before checking, notifying the OS kernel of subsequent access to the database pages, +then forcibly loads ones by sequential access and tries to lock database pages in memory. +.TP .BR \-n Open MDBX environment(s) which do not use subdirectories. This is legacy option. For now MDBX handles this automatically. diff --git a/src/man1/mdbx_copy.1 b/src/man1/mdbx_copy.1 index 3cb97a34..09cdaa5a 100644 --- a/src/man1/mdbx_copy.1 +++ b/src/man1/mdbx_copy.1 @@ -45,6 +45,13 @@ or unused pages will be omitted from the copy. This option will slow down the backup process as it is more CPU-intensive. Currently it fails if the environment has suffered a page leak. 
.TP +.BR \-u +Warms up the DB before copying via notifying OS kernel of subsequent access to the database pages. +.TP +.BR \-U +Warms up the DB before copying, notifying the OS kernel of subsequent access to the database pages, +then forcibly loads ones by sequential access and tries to lock database pages in memory. +.TP .BR \-n Open MDBX environment(s) which do not use subdirectories. This is legacy option. For now MDBX handles this automatically. diff --git a/src/man1/mdbx_dump.1 b/src/man1/mdbx_dump.1 index 417488e7..add3fe0e 100644 --- a/src/man1/mdbx_dump.1 +++ b/src/man1/mdbx_dump.1 @@ -66,6 +66,13 @@ Dump a specific subdatabase. If no database is specified, only the main database .BR \-r Rescure mode. Ignore some errors to dump corrupted DB. .TP +.BR \-u +Warms up the DB before dumping via notifying OS kernel of subsequent access to the database pages. +.TP +.BR \-U +Warms up the DB before dumping, notifying the OS kernel of subsequent access to the database pages, +then forcibly loads ones by sequential access and tries to lock database pages in memory. +.TP .BR \-n Dump an MDBX database which does not use subdirectories. This is legacy option. For now MDBX handles this automatically. 
From d23f695ab31fda4a08b4f30ff7bf7d051c63f4d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 11 Nov 2022 16:56:43 +0300 Subject: [PATCH 215/364] =?UTF-8?q?mdbx:=20=D0=B8=D0=BC=D0=BF=D0=BE=D1=80?= =?UTF-8?q?=D1=82=20ChangeLog=20=D0=B4=D0=BB=D1=8F=20=D0=B2=D0=B5=D1=82?= =?UTF-8?q?=D0=BA=D0=B8=20`stable`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 69 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 1bdad742..5201a414 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -208,10 +208,75 @@ Not a release but preparation for changing feature set and API. ------------------------------------------------------------------------------- +## v0.11.13 at (Swashplate) 2022-11-10 + +The stable bugfix release in memory of [Boris Yuryev](https://ru.wikipedia.org/wiki/Юрьев,_Борис_Николаевич) on his 133rd birthday. + +``` +30 files changed, 405 insertions(+), 136 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Fixes: + + - Fixed builds with older libc versions after using `fcntl64()` (backport). + - Fixed builds with older `stdatomic.h` versions, + where the `ATOMIC_*_LOCK_FREE` macros mistakenly redefined using functions (backport). + - Added workaround for `mremap()` defect to avoid assertion failure (backport). + - Workaround for `encryptfs` bug(s) in the `copy_file_range` implementation (backport). + - Fixed unexpected `MDBX_BUSY` from `mdbx_env_set_option()`, `mdbx_env_set_syncbytes()` + and `mdbx_env_set_syncperiod()` (backport). + - CMake requirements lowered to version 3.0.2 (backport). + +Minors: + + - Minor clarification output of `--help` for `mdbx_test` (backport). + - Added admonition of insecure for RISC-V (backport). 
+ - Stochastic scripts and CMake files synchronized with the `devel` branch. + - Use `--dont-check-ram-size` for small-tests make-targets (backport). + + +## v0.11.12 (Эребуни) at 2022-10-12 + +The stable bugfix release. + +``` +11 files changed, 96 insertions(+), 49 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Fixes: + + - Fixed static assertion failure on platforms where the `off_t` type is wider + than corresponding fields of `struct flock` used for file locking (backport). + Now _libmdbx_ will use `fcntl64(F_GETLK64/F_SETLK64/F_SETLKW64)` if available. + - Fixed assertion check inside `page_retire_ex()` (backport). + +Minors: + + - Fixed `-Wint-to-pointer-cast` warnings while casting to `mdbx_tid_t` (backport). + - Removed needless `LockFileEx()` inside `mdbx_env_copy()` (backport). + + +## v0.11.11 (Тендра-1790) at 2022-09-11 + +The stable bugfix release. + +``` +10 files changed, 38 insertions(+), 21 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Fixes: + + - Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` which could result in returning `MDBX_EKEYMISMATCH` for valid cases. + - Fixed an extra ensure/assertion check of `oldest_reader` inside `mdbx_txn_end()`. + - Fixed derived C++ builds by removing `MDBX_INTERNAL_FUNC` for `mdbx_w2mb()` and `mdbx_mb2w()`. + + ## v0.11.10 (the TriColor) at 2022-08-22 The stable bugfix release. -It is planned that this will be the last release of the v0.11 branch. ``` 14 files changed, 263 insertions(+), 252 deletions(-) @@ -239,8 +304,6 @@ Minors: - Minor clarified `iov_page()` failure case. 
-------------------------------------------------------------------------------- - ## v0.11.9 (Чирчик-1992) at 2022-08-02 From 9b062cf0c7d41297f756c7f7b897ed981022bdbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 11 Nov 2022 17:35:32 +0300 Subject: [PATCH 216/364] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=BF=D1=83=D1=81?= =?UTF-8?q?=D0=BA=20v0.12.2=20(=D0=98=D0=B2=D0=B0=D0=BD=20=D0=AF=D1=80?= =?UTF-8?q?=D1=8B=D0=B3=D0=B8=D0=BD)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Выпуск с существенными доработками и новой функциональностью в память о российском борце [Иване Сергеевиче Ярыгине](https://ru.wikipedia.org/wiki/Ярыгин,_Иван_Сергеевич). На Олимпийских играх в Мюнхене в 1972 году Иван Ярыгин уложил всех соперников на лопатки, суммарно затратив менее 9 минут. Этот рекорд никем не побит до сих пор. Новое: ------ - Поддержка всех основных опций при сборке посредством CMake. - Требования к CMake понижены до версии 3.0.2 для возможности сборки для устаревших платформ. - Добавлена возможность профилирования работы GC в сложных и/или нагруженных сценариях (например Ethereum/Erigon). По-умолчанию соответствующий код отключен, а для его активации необходимо указать опцию сборки `MDBX_ENABLE_PROFGC=1`. - Добавлена функция `mdbx_env_warmup()` для "прогрева" БД с возможностью закрепления страниц в памяти. В утилиты `mdbx_chk`, `mdbx_copy` и `mdbx_dump` добавлены опции `-u` и `-U` для активации соответствующего функционала. - Отключение учета «грязных» страниц в не требующих этого режимах (`MDBX_WRITEMAP` при `MDBX_AVOID_MSYNC=0`). Доработка позволяет снизить накладные расходы и была запланирована давно, но откладывалась так как требовала других изменений. - Вытеснение из памяти (спиллинг) «грязных» страниц с учетом размера large/overflow-страниц. 
Доработка позволяет корректно соблюдать политику, задаваемую опциями `MDBX_opt_txn_dp_limit`, `MDBX_opt_spill_max_denominator`, `MDBX_opt_spill_min_denominator` и была запланирована давно, но откладывалась так как требовала других изменений. - Для Windows в API добавлены UNICODE-зависимые определения макросов `MDBX_DATANAME`, `MDBX_LOCKNAME` и `MDBX_LOCK_SUFFIX`. - Переход на преимущественное использование типа `size_t` для уменьшения накладных расходов на платформе Эльбрус. - В API добавлены функции `mdbx_limits_valsize4page_max()` и `mdbx_env_get_valsize4page_max()` возвращающие максимальный размер в байтах значения, которое может быть размещено в одной large/overflow-странице, а не последовательности из двух или более таких страниц. Для таблиц с поддержкой дубликатов вынос значений на large/overflow-страницы не поддерживается, поэтому результат совпадает с `mdbx_limits_valsize_max()`. - В API добавлены функции `mdbx_limits_pairsize4page_max()` и `mdbx_env_get_pairsize4page_max()` возвращающие в байтах максимальный суммарный размер пары ключ-значение для их размещения на одной листовой странице, без выноса значения на отдельную large/overflow-страницу. Для таблиц с поддержкой дубликатов вынос значений на large/overflow-страницы не поддерживается, поэтому результат определяет максимальный/допустимый суммарный размер пары ключ-значение. - Реализовано использование асинхронной (overlapped) записи в Windows, включая использование небуферизированного ввода-вывода и `WriteGather()`. Это позволяет сократить накладные расходы и частично обойти проблемы Windows с низкой производительностью ввода-вывода, включая большие задержки `FlushFileBuffers()`. Новый код также обеспечивает консолидацию записываемых регионов на всех платформах, а на Windows использование событий (events) сведено к минимуму, одновременно с автоматическим использованием `WriteGather()`.
Поэтому ожидается существенное снижение накладных расходов взаимодействия с ОС, а в Windows это ускорение, в некоторых сценариях, может быть кратным в сравнении с LMDB. - Добавлена опция сборки `MDBX_AVOID_MSYNC`, которая определяет поведение libmdbx в режиме `MDBX_WRITEMAP` (когда данные изменяются непосредственно в отображенных в ОЗУ страницах БД): * Если `MDBX_AVOID_MSYNC=0` (по умолчанию на всех системах кроме Windows), то (как прежде) сохранение данных выполняется посредством `msync()`, либо `FlushViewOfFile()` на Windows. На платформах с полноценной подсистемой виртуальной памяти и адекватным файловым вводом-выводом это обеспечивает минимум накладных расходов (один системный вызов) и максимальную производительность. Однако на Windows это приводит к значительной деградации, в том числе из-за того, что после `FlushViewOfFile()` требуется также вызов `FlushFileBuffers()` с массой проблем и суеты внутри ядра ОС. * Если `MDBX_AVOID_MSYNC=1` (по умолчанию только на Windows), то сохранение данных выполняется явной записью в файл каждой измененной страницы БД. Это требует дополнительных накладных расходов, как на отслеживание измененных страниц (ведение списков "грязных" страниц), так и на системные вызовы для их записи. Кроме этого, с точки зрения подсистемы виртуальной памяти ядра ОС, страницы БД, измененные в ОЗУ и явно записанные в файл, могут либо оставаться "грязными" и быть повторно записаны ядром ОС позже, либо требовать дополнительных накладных расходов для отслеживания PTE (Page Table Entries), их модификации и дополнительного копирования данных. Тем не менее, по имеющейся информации, на Windows такой путь записи данных в целом обеспечивает более высокую производительность. - Улучшение эвристики включения авто-слияния записей GC. - Изменение формата LCK и семантики некоторых внутренних полей.
Версии libmdbx, использующие разный формат, не смогут работать с одной БД одновременно, а только поочередно (LCK-файл переписывается при открытии первым открывающим БД процессом). - В `C++` API добавлены методы фиксации транзакции с получением информации о задержках. - Added `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` build option to control use of GCC's `__builtin_cpu_supports()` function, which could be unavailable on fake OSes (macos, ios, android, etc). Исправления (без корректировок вышеперечисленных новых функций): ---------------------------------------------------------------- - Устранение ряда предупреждений при сборке посредством MinGW. - Устранение ложно-положительных сообщений от Valgrind об использовании неинициализированных данных из-за выравнивающих зазоров в `struct troika`. - Исправлен возврат неожиданной ошибки `MDBX_BUSY` из функций `mdbx_env_set_option()`, `mdbx_env_set_syncbytes()` и `mdbx_env_set_syncperiod()`. - Небольшие исправления для совместимости с CMake 3.8. - Больше контроля и осторожности (паранойи) для страховки от дефектов `mremap()`. - Костыль для починки сборки со старыми версиями `stdatomic.h` из GNU Lib C, где макросы `ATOMIC_*_LOCK_FREE` ошибочно переопределяются через функции. - Использование `fcntl64(F_GETLK64/F_SETLK64/F_SETLKW64)` при наличии. Это решает проблему срабатывания проверочного утверждения при сборке для платформ, где тип `off_t` шире соответствующих полей структуры `flock`, используемой для блокировки файлов. - Доработан сбор информации о задержках при фиксации транзакций: * Устранено искажение замеров длительности обновления GC при включении отладочного внутреннего аудита; * Защита от underflow-нуля только общей задержки в метриках, чтобы исключить ситуации, когда сумма отдельных стадий больше общей длительности. - Ряд исправлений для устранения срабатываний проверочных утверждений в отладочных сборках. - Более осторожное преобразование к типу `mdbx_tid_t` для устранения предупреждений.
- Исправление лишнего сброса данных на диск в режиме `MDBX_SAFE_NOSYNC` при обновлении GC. - Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` which could result in returning `MDBX_EKEYMISMATCH` for valid cases. - Fixed nasty `clz()` bug (by using `_BitScanReverse()`, only MSVC builds affected). Мелочи: ------- - Исторические ссылки, связанные с удалённым на ~~github~~ проектом, перенаправлены на [web.archive.org](https://web.archive.org/web/https://github.com/erthink/libmdbx). - Синхронизированы конструкции CMake между проектами. - Добавлено предупреждение о небезопасности RISC-V. - Добавлено описание параметров `MDBX_debug_func` и `MDBX_debug_func`. - Добавлено обходное решение для минимизации ложно-положительных конфликтов при использовании файловых блокировок в Windows. - Проверка атомарности C11-операций с 32/64-битными данными. - Уменьшение в 42 раза значения по умолчанию для `me_options.dp_limit` в отладочных сборках. - Добавление платформы `gcc-riscv64-linux-gnu` в список для цели `cross-gcc`. - Небольшие правки скрипта `long_stochastic.sh` для работы в Windows. - Удаление ненужного вызова `LockFileEx()` внутри `mdbx_env_copy()`. - Добавлено описание использования файловых дескрипторов в различных режимах. - Добавлено использование `_CrtDbgReport()` в отладочных сборках. - Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`. - Removed description of deprecated usage of `MDBX_NODUPDATA`. - Fixed regression in ASAN/Valgrind-enabled builds. - Fixed minor MinGW warning.
64 files changed, 5573 insertions(+), 2510 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) --- ChangeLog.md | 13 ++++++++++++- src/man1/mdbx_chk.1 | 2 +- src/man1/mdbx_copy.1 | 2 +- src/man1/mdbx_drop.1 | 2 +- src/man1/mdbx_dump.1 | 2 +- src/man1/mdbx_load.1 | 2 +- src/man1/mdbx_stat.1 | 2 +- 7 files changed, 18 insertions(+), 7 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 5201a414..7427de95 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,7 +4,18 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.2 (Ярыгин) запланировано на 2022-11-11 +## v0.12.2 (Иван Ярыгин) от 2022-11-11 + +Выпуск с существенными доработками и новой функциональностью +в память о российском борце [Иване Сергеевиче Ярыгине](https://ru.wikipedia.org/wiki/Ярыгин,_Иван_Сергеевич). + +На Олимпийских играх в Мюнхене в 1972 году Иван Ярыгин уложил всех соперников на лопатки, +суммарно затратив менее 9 минут. Этот рекорд никем не побит до сих пор. + +``` +64 files changed, 5573 insertions(+), 2510 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` Новое: diff --git a/src/man1/mdbx_chk.1 b/src/man1/mdbx_chk.1 index 269a8246..e0587e99 100644 --- a/src/man1/mdbx_chk.1 +++ b/src/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ .\" Copyright 2015-2022 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_CHK 1 "2022-08-24" "MDBX 0.12.1" +.TH MDBX_CHK 1 "2022-11-11" "MDBX 0.12.2" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/src/man1/mdbx_copy.1 b/src/man1/mdbx_copy.1 index 09cdaa5a..49e2b4d4 100644 --- a/src/man1/mdbx_copy.1 +++ b/src/man1/mdbx_copy.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . 
.\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2022-08-24" "MDBX 0.12.1" +.TH MDBX_COPY 1 "2022-11-11" "MDBX 0.12.2" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/src/man1/mdbx_drop.1 b/src/man1/mdbx_drop.1 index 099c485b..ec01905b 100644 --- a/src/man1/mdbx_drop.1 +++ b/src/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ .\" Copyright 2021-2022 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2022-08-24" "MDBX 0.12.1" +.TH MDBX_DROP 1 "2022-11-11" "MDBX 0.12.2" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/src/man1/mdbx_dump.1 b/src/man1/mdbx_dump.1 index add3fe0e..5e173903 100644 --- a/src/man1/mdbx_dump.1 +++ b/src/man1/mdbx_dump.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2022-08-24" "MDBX 0.12.1" +.TH MDBX_DUMP 1 "2022-11-11" "MDBX 0.12.2" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/src/man1/mdbx_load.1 b/src/man1/mdbx_load.1 index 4ab41fbf..44dbe7d7 100644 --- a/src/man1/mdbx_load.1 +++ b/src/man1/mdbx_load.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_LOAD 1 "2022-08-24" "MDBX 0.12.1" +.TH MDBX_LOAD 1 "2022-11-11" "MDBX 0.12.2" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/src/man1/mdbx_stat.1 b/src/man1/mdbx_stat.1 index a47d52f0..3bc3664a 100644 --- a/src/man1/mdbx_stat.1 +++ b/src/man1/mdbx_stat.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_STAT 1 "2022-08-24" "MDBX 0.12.1" +.TH MDBX_STAT 1 "2022-11-11" "MDBX 0.12.2" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS From 6c840cf58e93a09206f95a3ea47f799ca1f3da16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 13 Nov 2022 20:59:31 +0300 Subject: [PATCH 217/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=BE=D0=B4=D1=81=D1=87?= =?UTF-8?q?=D0=B5=D1=82=20=D0=B3=D1=80=D1=8F=D0=B7=D0=BD=D1=8B=D1=85=20?= =?UTF-8?q?=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86=20=D0=B2=20=D1=80?= =?UTF-8?q?=D0=B5=D0=B6=D0=B8=D0=BC=D0=B5=20`MDBX=5FWRITEMAP`=20=D0=B4?= =?UTF-8?q?=D0=BB=D1=8F=20=D1=81=D1=82=D0=B0=D1=82=D0=B8=D1=81=D1=82=D0=B8?= =?UTF-8?q?=D0=BA=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 138 +++++++++++++++++++++++++----------------------- src/internals.h | 15 ++++-- 2 files changed, 81 insertions(+), 72 deletions(-) diff --git a/src/core.c b/src/core.c index cbb94b47..d60752fc 100644 --- a/src/core.c +++ b/src/core.c @@ -2460,49 +2460,50 @@ __hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { } static void spill_remove(MDBX_txn *txn, size_t idx, pgno_t npages) { - tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spill_pages) && - txn->tw.spill_least_removed > 0); - txn->tw.spill_least_removed = - (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed; - txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SETSIZE(txn->tw.spill_pages, - MDBX_PNL_GETSIZE(txn->tw.spill_pages) - - (idx == MDBX_PNL_GETSIZE(txn->tw.spill_pages))); + tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && + txn->tw.spilled.least_removed > 0); + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? 
idx + : txn->tw.spilled.least_removed; + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); while (unlikely(npages > 1)) { - const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1; + const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1; if (MDBX_PNL_ASCENDING) { - if (++idx > MDBX_PNL_GETSIZE(txn->tw.spill_pages) || - (txn->tw.spill_pages[idx] >> 1) != pgno) + if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || + (txn->tw.spilled.list[idx] >> 1) != pgno) return; } else { - if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno) + if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno) return; - txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) - ? idx - : txn->tw.spill_least_removed; + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? idx + : txn->tw.spilled.least_removed; } - txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SETSIZE(txn->tw.spill_pages, - MDBX_PNL_GETSIZE(txn->tw.spill_pages) - - (idx == MDBX_PNL_GETSIZE(txn->tw.spill_pages))); + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); --npages; } } static MDBX_PNL spill_purge(MDBX_txn *txn) { - tASSERT(txn, txn->tw.spill_least_removed > 0); - const MDBX_PNL sl = txn->tw.spill_pages; - if (txn->tw.spill_least_removed != INT_MAX) { + tASSERT(txn, txn->tw.spilled.least_removed > 0); + const MDBX_PNL sl = txn->tw.spilled.list; + if (txn->tw.spilled.least_removed != INT_MAX) { size_t len = MDBX_PNL_GETSIZE(sl), r, w; - for (w = r = txn->tw.spill_least_removed; r <= len; ++r) { + for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) { sl[w] = sl[r]; w += 1 - (sl[r] & 1); } for (size_t i = 1; i < w; ++i) tASSERT(txn, (sl[i] & 1) == 0); MDBX_PNL_SETSIZE(sl, w - 1); - txn->tw.spill_least_removed = INT_MAX; + 
txn->tw.spilled.least_removed = INT_MAX; } else { for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i) tASSERT(txn, (sl[i] & 1) == 0); @@ -2558,7 +2559,8 @@ static __inline size_t pnl_search(const MDBX_PNL pnl, pgno_t pgno, } static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { - const MDBX_PNL pnl = txn->tw.spill_pages; + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const MDBX_PNL pnl = txn->tw.spilled.list; if (likely(!pnl)) return 0; pgno <<= 1; @@ -2568,7 +2570,7 @@ static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, pgno_t npages) { - const MDBX_PNL pnl = txn->tw.spill_pages; + const MDBX_PNL pnl = txn->tw.spilled.list; if (likely(!pnl)) return false; const size_t len = MDBX_PNL_GETSIZE(pnl); @@ -4023,7 +4025,7 @@ static bool txn_refund(MDBX_txn *txn) { if (before == txn->mt_next_pgno) return false; - if (txn->tw.spill_pages) + if (txn->tw.spilled.list) /* Squash deleted pagenums if we refunded any */ spill_purge(txn); @@ -4132,7 +4134,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, is_frozen = true; if (ASSERT_ENABLED()) { for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { - tASSERT(txn, !search_spilled(scan, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(scan, pgno)); tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); } } @@ -4177,7 +4179,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, is_shadowed = IS_SHADOWED(txn, mp); if (is_dirty) { tASSERT(txn, !is_spilled); - tASSERT(txn, !search_spilled(txn, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || (txn->mt_flags & MDBX_WRITEMAP)); } else { @@ -4559,7 +4561,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, int err = iov_page(txn, ctx, dp, npages); if (likely(err == MDBX_SUCCESS) 
&& (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP))) - err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); + err = pnl_append_range(true, &txn->tw.spilled.list, pgno << 1, npages); return err; } @@ -4758,7 +4760,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (txn->mt_flags & MDBX_WRITEMAP) { NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", dirty_entries, dirty_npages); - tASSERT(txn, txn->tw.spill_pages == nullptr); + tASSERT(txn, txn->tw.spilled.list == nullptr); const MDBX_env *env = txn->mt_env; rc = osal_msync(&txn->mt_env->me_dxb_mmap, 0, @@ -4782,10 +4784,10 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= need_spill_npages); if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - if (!txn->tw.spill_pages) { - txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = pnl_alloc(need_spill); - if (unlikely(!txn->tw.spill_pages)) { + if (!txn->tw.spilled.list) { + txn->tw.spilled.least_removed = INT_MAX; + txn->tw.spilled.list = pnl_alloc(need_spill); + if (unlikely(!txn->tw.spilled.list)) { rc = MDBX_ENOMEM; bailout: txn->mt_flags |= MDBX_TXN_ERROR; @@ -4794,7 +4796,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, } else { /* purge deleted slots */ spill_purge(txn); - rc = pnl_reserve(&txn->tw.spill_pages, need_spill); + rc = pnl_reserve(&txn->tw.spilled.list, need_spill); (void)rc /* ignore since the resulting list may be shorter and pnl_append() will increase pnl on demand */ ; @@ -4978,7 +4980,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, goto bailout; if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); + pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; 
} NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", @@ -5484,6 +5486,8 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); if (!txn->tw.dirtylist) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + txn->tw.writemap_dirty_npages += npages; + tASSERT(txn, txn->tw.spilled.list == nullptr); return MDBX_SUCCESS; } tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); @@ -8332,8 +8336,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); - txn->tw.spill_pages = NULL; - txn->tw.spill_least_removed = 0; + txn->tw.spilled.list = NULL; + txn->tw.spilled.least_removed = 0; txn->tw.last_reclaimed = 0; if (txn->tw.lifo_reclaimed) MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, 0); @@ -8627,7 +8631,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->tw.dirtylru = parent->tw.dirtylru; dpl_sort(parent); - if (parent->tw.spill_pages) + if (parent->tw.spilled.list) spill_purge(parent); tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= @@ -8704,7 +8708,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, eASSERT(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); - assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); + assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); } txn->mt_signature = MDBX_MT_SIGNATURE; txn->mt_userctx = context; @@ -8809,10 +8813,9 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { env, txn->mt_child ? (size_t)txn->tw.retired_pages : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); - info->txn_space_dirty = - txn->tw.dirtylist - ? 
pgno2bytes(env, txn->tw.dirtylist->pages_including_loose) - : 0; + info->txn_space_dirty = pgno2bytes( + env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages); info->txn_reader_lag = INT64_MAX; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && lck) { @@ -9128,8 +9131,8 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { txn->mt_flags = MDBX_TXN_FINISHED; txn->mt_owner = 0; env->me_txn = txn->mt_parent; - pnl_free(txn->tw.spill_pages); - txn->tw.spill_pages = nullptr; + pnl_free(txn->tw.spilled.list); + txn->tw.spilled.list = nullptr; if (txn == env->me_txn0) { eASSERT(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ @@ -10476,7 +10479,8 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); /* Filter-out parent spill list */ - if (parent->tw.spill_pages && MDBX_PNL_GETSIZE(parent->tw.spill_pages) > 0) { + if (parent->tw.spilled.list && + MDBX_PNL_GETSIZE(parent->tw.spilled.list) > 0) { const MDBX_PNL sl = spill_purge(parent); size_t len = MDBX_PNL_GETSIZE(sl); if (len) { @@ -10564,10 +10568,10 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } /* Remove anything in our spill list from parent's dirty list */ - if (txn->tw.spill_pages) { - tASSERT(txn, pnl_check_allocated(txn->tw.spill_pages, + if (txn->tw.spilled.list) { + tASSERT(txn, pnl_check_allocated(txn->tw.spilled.list, (size_t)parent->mt_next_pgno << 1)); - dpl_sift(parent, txn->tw.spill_pages, true); + dpl_sift(parent, txn->tw.spilled.list, true); tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom @@ -10719,23 +10723,23 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, tASSERT(parent, dirtylist_check(parent)); dpl_free(txn); - if (txn->tw.spill_pages) { - if (parent->tw.spill_pages) { + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { /* Must not fail since space was preserved above. */ - pnl_merge(parent->tw.spill_pages, txn->tw.spill_pages); - pnl_free(txn->tw.spill_pages); + pnl_merge(parent->tw.spilled.list, txn->tw.spilled.list); + pnl_free(txn->tw.spilled.list); } else { - parent->tw.spill_pages = txn->tw.spill_pages; - parent->tw.spill_least_removed = txn->tw.spill_least_removed; + parent->tw.spilled.list = txn->tw.spilled.list; + parent->tw.spilled.least_removed = txn->tw.spilled.least_removed; } tASSERT(parent, dirtylist_check(parent)); } parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - if (parent->tw.spill_pages) { - assert(pnl_check_allocated(parent->tw.spill_pages, + if (parent->tw.spilled.list) { + assert(pnl_check_allocated(parent->tw.spilled.list, (size_t)parent->mt_next_pgno << 1)); - if (MDBX_PNL_GETSIZE(parent->tw.spill_pages)) + if (MDBX_PNL_GETSIZE(parent->tw.spilled.list)) parent->mt_flags |= MDBX_TXN_SPILLS; } } @@ -10806,8 +10810,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { sizeof(parent->mt_geo)) == 0); tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary, sizeof(parent->mt_canary)) == 0); - tASSERT(txn, !txn->tw.spill_pages || - MDBX_PNL_GETSIZE(txn->tw.spill_pages) == 0); + tASSERT(txn, !txn->tw.spilled.list || + MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0); tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ @@ -10827,10 +10831,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - if (txn->tw.spill_pages) { - if (parent->tw.spill_pages) { - rc = pnl_need(&parent->tw.spill_pages, - MDBX_PNL_GETSIZE(txn->tw.spill_pages)); + if (txn->tw.spilled.list) { + if 
(parent->tw.spilled.list) { + rc = pnl_need(&parent->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -14277,7 +14281,7 @@ __cold static int env_close(MDBX_env *env) { dpl_free(env->me_txn0); txl_free(env->me_txn0->tw.lifo_reclaimed); pnl_free(env->me_txn0->tw.retired_pages); - pnl_free(env->me_txn0->tw.spill_pages); + pnl_free(env->me_txn0->tw.spilled.list); pnl_free(env->me_txn0->tw.relist); osal_free(env->me_txn0); env->me_txn0 = nullptr; diff --git a/src/internals.h b/src/internals.h index d29b1068..1c0c6f98 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1088,11 +1088,16 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; - size_t spill_least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL spill_pages; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + }; } tw; }; }; From eaf063ca9b2272be054476bbba973ee0c4ee1e0f Mon Sep 17 00:00:00 2001 From: Jan Biedermann Date: Mon, 14 Nov 2022 18:03:20 +0300 Subject: [PATCH 218/364] mdbx: fix typo of `||` inside `#if` byte-order condition. 
https://gitflic.ru/project/erthink/libmdbx/merge-request/4 --- src/base.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/base.h b/src/base.h index 3533d575..bf5a5007 100644 --- a/src/base.h +++ b/src/base.h @@ -293,8 +293,8 @@ __extern_C key_t ftok(const char *, int); /* Byteorder */ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(i486) || defined(__i486) || defined(__i486__) || defined(i586) || \ + defined(__i586) || defined(__i586__) || defined(i686) || \ defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ From 0f92baaa5e4a692a394bdb9e9607736449725b9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 15 Nov 2022 01:08:23 +0300 Subject: [PATCH 219/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20debug=5Fbegin.h=20=D0=B8=20deb?= =?UTF-8?q?ug=5Fend.h?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/debug_begin.h | 56 +++++++++++++++++++++++------------------------ src/debug_end.h | 14 ++++++------ 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/debug_begin.h b/src/debug_begin.h index 9a904095..521e99cf 100644 --- a/src/debug_begin.h +++ b/src/debug_begin.h @@ -1,42 +1,42 @@ #if defined(__GNUC__) && !defined(__LCC__) -#pragma push_macro("mdbx_trace") -#pragma push_macro("mdbx_debug") -#pragma push_macro("mdbx_verbose") -#pragma push_macro("mdbx_notice") -#pragma push_macro("mdbx_warning") -#pragma push_macro("mdbx_error") -#pragma push_macro("mdbx_assert") +#pragma push_macro("TRACE") +#pragma 
push_macro("DEBUG") +#pragma push_macro("VERBOSE") +#pragma push_macro("NOTICE") +#pragma push_macro("WARNING") +#pragma push_macro("ERROR") +#pragma push_macro("eASSERT") -#undef mdbx_trace -#define mdbx_trace(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef TRACE +#define TRACE(fmt, ...) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_debug -#define mdbx_debug(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef DEBUG +#define DEBUG(fmt, ...) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_verbose -#define mdbx_verbose(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef VERBOSE +#define VERBOSE(fmt, ...) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_notice -#define mdbx_notice(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef NOTICE +#define NOTICE(fmt, ...) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_warning -#define mdbx_warning(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef WARNING +#define WARNING(fmt, ...) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_error -#define mdbx_error(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef ERROR +#define ERROR(fmt, ...) 
\ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_assert -#define mdbx_assert(env, expr) mdbx_ensure(env, expr) +#undef eASSERT +#define eASSERT(env, expr) ENSURE(env, expr) #if !defined(__clang__) -#pragma GCC optimize("-O0") +#pragma GCC optimize("-Og") #endif #endif /* GCC only */ diff --git a/src/debug_end.h b/src/debug_end.h index a854f715..bbf66526 100644 --- a/src/debug_end.h +++ b/src/debug_end.h @@ -1,12 +1,12 @@ #if defined(__GNUC__) && !defined(__LCC__) -#pragma pop_macro("mdbx_trace") -#pragma pop_macro("mdbx_debug") -#pragma pop_macro("mdbx_verbose") -#pragma pop_macro("mdbx_notice") -#pragma pop_macro("mdbx_warning") -#pragma pop_macro("mdbx_error") -#pragma pop_macro("mdbx_assert") +#pragma pop_macro("TRACE") +#pragma pop_macro("DEBUG") +#pragma pop_macro("VERBOSE") +#pragma pop_macro("NOTICE") +#pragma pop_macro("WARNING") +#pragma pop_macro("ERROR") +#pragma pop_macro("eASSERT") #if !defined(__clang__) #pragma GCC reset_options From 3563ed00e3884a9417e3fe7abeca5e3ff9d77a18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 15 Nov 2022 14:42:05 +0300 Subject: [PATCH 220/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BD=D0=B5?= =?UTF-8?q?-=D1=81=D0=BF=D0=B0=D1=80=D0=B5=D0=BD=D0=BD=D0=BE=D0=B3=D0=BE?= =?UTF-8?q?=20=D0=BA=D1=83=D1=80=D1=81=D0=BE=D1=80=D0=B0=20=D0=B8=20`gc=5F?= =?UTF-8?q?cursor=5Finit()`=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B8=20`upda?= =?UTF-8?q?te=5Fgc()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 91 +++++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/src/core.c b/src/core.c index d60752fc..cf34a1a2 100644 --- a/src/core.c +++ b/src/core.c @@ -9399,7 +9399,7 @@ typedef struct 
gc_update_context { #if MDBX_ENABLE_BIGFOOT txnid_t bigfoot; #endif /* MDBX_ENABLE_BIGFOOT */ - MDBX_cursor_couple cursor; + MDBX_cursor cursor; } gcu_context_t; static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { @@ -9408,7 +9408,7 @@ static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { #if MDBX_ENABLE_BIGFOOT ctx->bigfoot = txn->mt_txnid; #endif /* MDBX_ENABLE_BIGFOOT */ - return cursor_init(&ctx->cursor.outer, txn, FREE_DBI); + return gc_cursor_init(&ctx->cursor, txn); } static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { @@ -9427,10 +9427,10 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { #endif /* MDBX_ENABLE_BIGFOOT */ key.iov_len = sizeof(txnid_t); const struct cursor_set_result csr = - cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); + cursor_set(&ctx->cursor, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; - err = mdbx_cursor_del(&ctx->cursor.outer, 0); + err = mdbx_cursor_del(&ctx->cursor, 0); TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); } @@ -9472,14 +9472,14 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, key.iov_base = val.iov_base = nullptr; key.iov_len = sizeof(txnid_t); val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - err = cursor_spill(&ctx->cursor.outer, &key, &val); + err = cursor_spill(&ctx->cursor, &key, &val); if (unlikely(err != MDBX_SUCCESS)) return err; } tASSERT(txn, txn->mt_flags & MDBX_TXN_UPDATE_GC); txn->mt_flags -= MDBX_TXN_UPDATE_GC; - err = cursor_touch(&ctx->cursor.outer); + err = cursor_touch(&ctx->cursor); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); if (unlikely(pages4retiredlist > 1) && @@ -9489,17 +9489,17 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, err = gcu_clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; - err = page_alloc_slowpath(&ctx->cursor.outer, 
(pgno_t)pages4retiredlist, + err = page_alloc_slowpath(&ctx->cursor, (pgno_t)pages4retiredlist, MDBX_ALLOC_GC | MDBX_ALLOC_RESERVE) .err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); - cASSERT(&ctx->cursor.outer, + cASSERT(&ctx->cursor, gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); } while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && err == MDBX_SUCCESS) - err = page_alloc_slowpath(&ctx->cursor.outer, 0, + err = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG) .err; @@ -9534,8 +9534,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; (void)dbg_prefix_mode; txn->mt_flags += MDBX_TXN_UPDATE_GC; - ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer; + ctx->cursor.mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = &ctx->cursor; /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. 
@@ -9597,7 +9597,7 @@ retry: ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &ctx->cleaned_id; key.iov_len = sizeof(ctx->cleaned_id); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET); + rc = mdbx_cursor_get(&ctx->cursor, &key, NULL, MDBX_SET); if (rc == MDBX_NOTFOUND) continue; if (unlikely(rc != MDBX_SUCCESS)) @@ -9610,18 +9610,17 @@ retry: tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix_mode, ctx->cleaned_slot, ctx->cleaned_id); - tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); - rc = mdbx_cursor_del(&ctx->cursor.outer, 0); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor); + rc = mdbx_cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); txl_sort(txn->tw.lifo_reclaimed); } } else { - /* If using records from GC which we have not yet deleted, - * now delete them and any we reserved for tw.relist. */ + /* Удаляем оставшиеся вынутые из GC записи. 
*/ while (ctx->cleaned_id <= txn->tw.last_reclaimed) { - rc = cursor_first(&ctx->cursor.outer, &key, NULL); + rc = cursor_first(&ctx->cursor, &key, NULL); if (rc == MDBX_NOTFOUND) break; if (unlikely(rc != MDBX_SUCCESS)) @@ -9646,8 +9645,8 @@ retry: tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, ctx->cleaned_id); - tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); - rc = mdbx_cursor_del(&ctx->cursor.outer, 0); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor); + rc = mdbx_cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9682,7 +9681,7 @@ retry: if (txn->tw.loose_count > 0) { TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, txn->tw.loose_count); - rc = page_alloc_slowpath(&ctx->cursor.outer, 0, + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE) .err; @@ -9768,8 +9767,7 @@ retry: if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ txn->mt_flags -= MDBX_TXN_UPDATE_GC; - rc = page_search(&ctx->cursor.outer, NULL, - MDBX_PS_LAST | MDBX_PS_MODIFY); + rc = page_search(&ctx->cursor, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); txn->mt_flags += MDBX_TXN_UPDATE_GC; if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; @@ -9801,7 +9799,7 @@ retry: ? 
env->me_maxgc_ov1page : left; data.iov_len = (chunk + 1) * sizeof(pgno_t); - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); + rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -9839,7 +9837,7 @@ retry: do { gcu_prepare_backlog(txn, ctx, true); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); + rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Retry if tw.retired_pages[] grew during the Put() */ @@ -9906,17 +9904,17 @@ retry: left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page && !ctx->dense) { - /* LY: need just a txn-id for save page list. */ + /* Hужен свобожный для для сохранения списка страниц. */ bool need_cleanup = false; - txnid_t snap_oldest; + txnid_t snap_oldest = 0; retry_rid: txn->mt_flags -= MDBX_TXN_UPDATE_GC; do { - snap_oldest = txn_oldest_reader(txn); - rc = page_alloc_slowpath(&ctx->cursor.outer, 0, + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE) .err; + snap_oldest = env->me_lck->mti_oldest_reader.weak; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); @@ -9956,7 +9954,8 @@ retry: ctx->rid); } - /* LY: GC is empty, will look any free txn-id in high2low order. */ + /* В GC нет годных к переработке записей, + * будем использовать свободные id в обратном порядке. 
*/ while (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * @@ -9974,26 +9973,20 @@ retry: } tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); - --ctx->rid; + ctx->rid -= 1; key.iov_base = &ctx->rid; key.iov_len = sizeof(ctx->rid); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); + rc = mdbx_cursor_get(&ctx->cursor, &key, &data, MDBX_SET_KEY); if (unlikely(rc == MDBX_SUCCESS)) { - DEBUG("%s: GC's id %" PRIaTXN " is used, continue bottom-up search", + DEBUG("%s: GC's id %" PRIaTXN " is present, going to first", dbg_prefix_mode, ctx->rid); - ++ctx->rid; - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST); - if (rc == MDBX_NOTFOUND) { - DEBUG("%s: GC is empty (going dense-mode)", dbg_prefix_mode); - ctx->dense = true; - break; - } + rc = cursor_first(&ctx->cursor, &key, nullptr); if (unlikely(rc != MDBX_SUCCESS || key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); if (gc_first <= MIN_TXNID) { DEBUG("%s: no free GC's id(s) less than %" PRIaTXN " (going dense-mode)", @@ -10041,13 +10034,13 @@ retry: tASSERT(txn, txn->tw.lifo_reclaimed == NULL); if (unlikely(ctx->rid == 0)) { ctx->rid = txn_oldest_reader(txn); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); - if (rc == MDBX_SUCCESS) { + rc = cursor_first(&ctx->cursor, &key, nullptr); + if (likely(rc == MDBX_SUCCESS)) { if (unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); if (ctx->rid >= gc_first) ctx->rid = gc_first - 1; if (unlikely(ctx->rid == 0)) { @@ -10138,7 +10131,7 @@ retry: TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, ctx->settled + 1, 
ctx->settled + chunk + 1, reservation_gc_id); gcu_prepare_backlog(txn, ctx, true); - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, + rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -10186,7 +10179,7 @@ retry: size_t left = amount; if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); - rc = cursor_first(&ctx->cursor.outer, &key, &data); + rc = cursor_first(&ctx->cursor, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { @@ -10220,7 +10213,7 @@ retry: dbg_prefix_mode, fill_gc_id, ctx->filled_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); + rc = mdbx_cursor_get(&ctx->cursor, &key, &data, MDBX_SET_KEY); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -10247,7 +10240,7 @@ retry: } chunk = left; } - rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, + rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_CURRENT | MDBX_RESERVE); txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; if (unlikely(rc != MDBX_SUCCESS)) @@ -10298,7 +10291,7 @@ retry: if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); - rc = cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT); + rc = cursor_next(&ctx->cursor, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { @@ -10329,7 +10322,7 @@ retry: ctx->cleaned_slot == MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); bailout: - txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; + txn->mt_cursors[FREE_DBI] = ctx->cursor.mc_next; MDBX_PNL_SETSIZE(txn->tw.relist, 0); #if MDBX_ENABLE_PROFGC From e518edcfedf1b04669e02616b5dcb04a9d2de2be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 15 Nov 2022 16:12:30 +0300 Subject: [PATCH 221/364] 
=?UTF-8?q?mdbx:=20=D1=83=D0=BD=D0=B8=D1=84=D0=B8?= =?UTF-8?q?=D1=86=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=B8?= =?UTF-8?q?=D0=BD=D0=B8=D1=86=D0=B8=D0=B0=D0=BB=D0=B8=D0=B7=D0=B0=D1=86?= =?UTF-8?q?=D0=B8=D0=B8=20`mp=5Ftxnid`=20=D0=B2=D0=BD=D1=83=D1=82=D1=80?= =?UTF-8?q?=D0=B8=20`page=5Fdirty()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index cf34a1a2..8f4c12f3 100644 --- a/src/core.c +++ b/src/core.c @@ -5484,6 +5484,7 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, pgno_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + mp->mp_txnid = txn->mt_front; if (!txn->tw.dirtylist) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); txn->tw.writemap_dirty_npages += npages; @@ -5500,7 +5501,6 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, #endif /* xMDBX_DEBUG_SPILLING == 2 */ int rc; - mp->mp_txnid = txn->mt_front; if (unlikely(txn->tw.dirtyroom == 0)) { if (txn->tw.loose_count) { MDBX_page *loose = txn->tw.loose_pages; @@ -17082,7 +17082,6 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; - ret.page->mp_txnid = mc->mc_txn->mt_front; cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT @@ -17114,7 +17113,6 @@ static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, ret.page->mp_pgno, npages); ret.page->mp_flags = P_OVERFLOW; - ret.page->mp_txnid = mc->mc_txn->mt_front; cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & 
MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT From 3e05d1a4272c463455020032f35e8758817e7ada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 16 Nov 2022 11:48:02 +0300 Subject: [PATCH 222/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=BF=D1=82=D0=B8=D0=BC?= =?UTF-8?q?=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20`page=5Fcopy()`=20=D0=B4?= =?UTF-8?q?=D0=BB=D1=8F=20`LEAF2`=20=D0=B8=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D0=B0=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=BE=D0=B8=D0=B4=D0=B0=D0=BB=D1=8C=D0=BD=D0=BE=D0=B3?= =?UTF-8?q?=D0=BE=20=D0=BA=D0=BE=D0=BD=D1=82=D1=80=D0=BE=D0=BB=D1=8F=20?= =?UTF-8?q?=D0=BE=D1=82=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BF=D0=BE=D0=BB=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D1=8F.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/core.c b/src/core.c index 8f4c12f3..08f13186 100644 --- a/src/core.c +++ b/src/core.c @@ -752,7 +752,7 @@ __cold static const char *pagetype_caption(const uint8_t type, } } -__cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3) +__cold static int MDBX_PRINTF_ARGS(2, 3) bad_page(const MDBX_page *mp, const char *fmt, ...) { if (LOG_ENABLED(MDBX_LOG_ERROR)) { static const MDBX_page *prev; @@ -7232,26 +7232,47 @@ __hot static pgr_t page_alloc(const MDBX_cursor *mc) { return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); } -/* Copy the used portions of a non-large/overflow page. */ -__hot static void page_copy(MDBX_page *dst, const MDBX_page *src, - size_t psize) { +/* Copy the used portions of a page. 
*/ +__hot static void page_copy(MDBX_page *const dst, const MDBX_page *const src, + const size_t size) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); + char *copy_dst = (void *)dst; + const char *copy_src = (const void *)src; + size_t copy_len = size; + if (src->mp_flags & P_LEAF2) { + copy_len = PAGEHDRSZ + src->mp_leaf2_ksize * page_numkeys(src); + if (unlikely(copy_len > size)) + goto bailout; + } if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { - size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; - + size_t upper = src->mp_upper, lower = src->mp_lower; + intptr_t unused = upper - lower; /* If page isn't full, just copy the used portion. Adjust * alignment so memcpy may copy words instead of bytes. */ - if (unused >= MDBX_CACHELINE_SIZE * 2) { + if (unused > MDBX_CACHELINE_SIZE * 3) { lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); - memcpy(dst, src, lower); - dst = (void *)((char *)dst + upper); - src = (void *)((char *)src + upper); - psize -= upper; + if (unlikely(upper > copy_len)) + goto bailout; + memcpy(copy_dst, copy_src, lower); + copy_dst += upper; + copy_src += upper; + copy_len -= upper; } } - memcpy(dst, src, psize); + memcpy(copy_dst, copy_src, copy_len); + return; + +bailout: + if (src->mp_flags & P_LEAF2) + bad_page(src, "%s addr %p, n-keys %zu, ksize %u", + "invalid/corrupted source page", __Wpedantic_format_voidptr(src), + page_numkeys(src), src->mp_leaf2_ksize); + else + bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page", + __Wpedantic_format_voidptr(src), src->mp_upper); + memset(dst, -1, size); } /* Pull a page off the txn's spill list, if present. 
From f73cd7a4917dd11b8a4ef248099f669190f58806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 16 Nov 2022 18:45:37 +0300 Subject: [PATCH 223/364] =?UTF-8?q?mdbx:=20=D1=83=D0=BF=D1=80=D0=BE=D1=89?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`page=5Falloc=5Fslowpath()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Упрощение за счет удаления проверки флага `MDBX_ALLOC_GC`, который всегда взведен при вызове page_alloc_slowpath(). --- src/core.c | 590 ++++++++++++++++++++++++++--------------------------- 1 file changed, 294 insertions(+), 296 deletions(-) diff --git a/src/core.c b/src/core.c index 08f13186..878fe784 100644 --- a/src/core.c +++ b/src/core.c @@ -5394,6 +5394,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL /* exclusive without-lck mode */)) { eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); + env->me_lck->mti_readers_refresh_flag.weak = nothing_changed; return env->me_lck->mti_oldest_reader.weak = steady; } @@ -6623,6 +6624,14 @@ static int gc_cursor_init(MDBX_cursor *mc, MDBX_txn *txn) { return cursor_init(mc, txn, FREE_DBI); } +__hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); + for (size_t i = 1; i <= len; ++i) + if (txn->tw.lifo_reclaimed[i] == id) + return true; + return false; +} + static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, char flags) { #if MDBX_ENABLE_PROFGC @@ -6642,11 +6651,9 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, prof->spe_counter += 1; #endif /* MDBX_ENABLE_PROFGC */ + eASSERT(env, flags & MDBX_ALLOC_GC); eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); - 
eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | - MDBX_ALLOC_BACKLOG)) == 0 || - (flags & MDBX_ALLOC_GC)); eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG)) == 0 || (flags & MDBX_ALLOC_NEW) == 0); @@ -6654,9 +6661,12 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno = 0, *range = nullptr; - size_t re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + size_t newnext, re_len = MDBX_PNL_GETSIZE(txn->tw.relist); if (num > 1) { eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); + eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); + if (unlikely(txn->mt_flags & MDBX_TXN_FROZEN_RE)) + goto no_gc; #if MDBX_ENABLE_PROFGC prof->xpages += 1; #endif /* MDBX_ENABLE_PROFGC */ @@ -6672,334 +6682,322 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, } } } else { - eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || - MDBX_PNL_GETSIZE(txn->tw.relist) == 0); + eASSERT(env, + (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || re_len == 0); } //--------------------------------------------------------------------------- - if (likely(flags & MDBX_ALLOC_GC)) { - if (unlikely(!is_gc_usable(txn))) - goto no_gc; + if (unlikely(!is_gc_usable(txn))) + goto no_gc; - eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0); - flags += (env->me_flags & MDBX_LIFORECLAIM) ? MDBX_ALLOC_LIFO : 0; + eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0); + flags += (env->me_flags & MDBX_LIFORECLAIM) ? 
MDBX_ALLOC_LIFO : 0; - const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; - if (txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num) - flags += MDBX_ALLOC_COALESCE; + const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num) + flags += MDBX_ALLOC_COALESCE; - MDBX_cursor recur; - ret.err = gc_cursor_init(&recur, txn); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; + MDBX_cursor recur; + ret.err = gc_cursor_init(&recur, txn); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; - retry_gc_refresh_oldest:; - txnid_t oldest = txn_oldest_reader(txn); - if (unlikely(!oldest)) - goto no_gc; +retry_gc_refresh_oldest:; + txnid_t oldest = txn_oldest_reader(txn); +retry_gc_have_oldest: + if (unlikely(oldest >= txn->mt_txnid)) { + ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN + " for current-txnid %" PRIaTXN, + oldest, txn->mt_txnid); + ret.err = MDBX_PROBLEM; + goto fail; + } + const txnid_t detent = oldest + 1; - retry_gc_have_oldest: - if (unlikely(oldest >= txn->mt_txnid)) { - ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN - " for current-txnid %" PRIaTXN, - oldest, txn->mt_txnid); - ret.err = MDBX_PROBLEM; - goto fail; - } - const txnid_t detent = oldest + 1; - - txnid_t last = 0; - bool should_scan = false; - MDBX_cursor_op op = MDBX_FIRST; - if (flags & MDBX_ALLOC_LIFO) { - if (!txn->tw.lifo_reclaimed) { - txn->tw.lifo_reclaimed = txl_alloc(); - if (unlikely(!txn->tw.lifo_reclaimed)) { - ret.err = MDBX_ENOMEM; - goto fail; - } + txnid_t id = 0; + bool should_scan = false; + MDBX_cursor_op op = MDBX_FIRST; + if (flags & MDBX_ALLOC_LIFO) { + if (!txn->tw.lifo_reclaimed) { + txn->tw.lifo_reclaimed = txl_alloc(); + if (unlikely(!txn->tw.lifo_reclaimed)) { + ret.err = MDBX_ENOMEM; + goto fail; } - /* Begin lookup backward from oldest reader */ - last = detent - 1; - 
op = MDBX_SET_RANGE; - } else if (txn->tw.last_reclaimed) { - /* Continue lookup forward from last-reclaimed */ - last = txn->tw.last_reclaimed + 1; - if (last >= detent) - goto no_gc; - op = MDBX_SET_RANGE; } + /* Begin lookup backward from oldest reader */ + id = detent - 1; + op = MDBX_SET_RANGE; + } else if (txn->tw.last_reclaimed) { + /* Continue lookup forward from last-reclaimed */ + id = txn->tw.last_reclaimed + 1; + if (id >= detent) + goto depleted_gc; + op = MDBX_SET_RANGE; + } - next_gc:; - MDBX_val key; - key.iov_base = &last; - key.iov_len = sizeof(last); +next_gc:; + MDBX_val key; + key.iov_base = &id; + key.iov_len = sizeof(id); #if MDBX_ENABLE_PROFGC - prof->rsteps += 1; + prof->rsteps += 1; #endif /* MDBX_ENABLE_PROFGC */ - /* Seek first/next GC record */ - ret.err = mdbx_cursor_get(&recur, &key, NULL, op); - if (unlikely(ret.err != MDBX_SUCCESS)) { - if (unlikely(ret.err != MDBX_NOTFOUND)) - goto fail; - if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { - op = MDBX_PREV; - goto next_gc; - } - goto depleted_gc; - } - if (unlikely(key.iov_len != sizeof(txnid_t))) { - ret.err = MDBX_CORRUPTED; + /* Seek first/next GC record */ + ret.err = mdbx_cursor_get(&recur, &key, NULL, op); + if (unlikely(ret.err != MDBX_SUCCESS)) { + if (unlikely(ret.err != MDBX_NOTFOUND)) goto fail; - } - last = unaligned_peek_u64(4, key.iov_base); - if (flags & MDBX_ALLOC_LIFO) { + if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { op = MDBX_PREV; - if (last >= detent) - goto next_gc; - /* skip IDs of records that already reclaimed */ - for (size_t i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i) - if (txn->tw.lifo_reclaimed[i] == last) - goto next_gc; - } else { - op = MDBX_NEXT; - if (unlikely(last >= detent)) - goto depleted_gc; + goto next_gc; } + goto depleted_gc; + } + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + id = unaligned_peek_u64(4, key.iov_base); + if (flags & MDBX_ALLOC_LIFO) { + op = 
MDBX_PREV; + if (id >= detent || is_already_reclaimed(txn, id)) + goto next_gc; + } else { + op = MDBX_NEXT; + if (unlikely(id >= detent)) + goto depleted_gc; + } - /* Reading next GC record */ - MDBX_val data; - MDBX_page *const mp = recur.mc_pg[recur.mc_top]; - if (unlikely((ret.err = node_read(&recur, - page_node(mp, recur.mc_ki[recur.mc_top]), - &data, mp)) != MDBX_SUCCESS)) + /* Reading next GC record */ + MDBX_val data; + MDBX_page *const mp = recur.mc_pg[recur.mc_top]; + if (unlikely( + (ret.err = node_read(&recur, page_node(mp, recur.mc_ki[recur.mc_top]), + &data, mp)) != MDBX_SUCCESS)) + goto fail; + + eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); + pgno_t *gc_pnl = (pgno_t *)data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !pnl_check(gc_pnl, txn->mt_next_pgno))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( + txn->tw.relist) >= env->me_options.rp_augment_limit) && + ((/* not a slot-request from gc-update */ + (flags & MDBX_ALLOC_SLOT) == 0 && + /* have enough unallocated space */ txn->mt_geo.upper >= + txn->mt_next_pgno + num) || + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { + /* Stop reclaiming to avoid large/overflow the page list. + * This is a rare case while search for a continuously multi-page region + * in a large database. 
+ * https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 + */ + NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " + "(chunk) -> %zu", + MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto depleted_gc; + } + + /* Remember ID of readed GC record */ + txn->tw.last_reclaimed = id; + if (flags & MDBX_ALLOC_LIFO) { + ret.err = txl_append(&txn->tw.lifo_reclaimed, id); + if (unlikely(ret.err != MDBX_SUCCESS)) goto fail; + } - eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); - pgno_t *gc_pnl = (pgno_t *)data.iov_base; - if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || - !pnl_check(gc_pnl, txn->mt_next_pgno))) { + /* Append PNL from GC record to tw.relist */ + ret.err = pnl_need(&txn->tw.relist, gc_len); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO + " len %zu, PNL", + id, txn->mt_dbs[FREE_DBI].md_root, gc_len); + for (size_t i = gc_len; i; i--) + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); + DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno); + } + + /* Merge in descending sorted order */ + re_len = pnl_merge(txn->tw.relist, gc_pnl); + should_scan = true; + if (AUDIT_ENABLED()) { + if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; goto fail; } - const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); - if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( - txn->tw.relist) >= env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ - (flags & MDBX_ALLOC_SLOT) == 0 && - /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + num) || - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid large/overflow the page list. 
- * This is a rare case while search for a continuously multi-page region - * in a large database. - * https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 */ - NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " - "(chunk) -> %zu", - MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); - goto depleted_gc; - } + } else { + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno)); + } + eASSERT(env, dirtylist_check(txn)); - /* Remember ID of readed GC record */ - txn->tw.last_reclaimed = last; - if (flags & MDBX_ALLOC_LIFO) { - ret.err = txl_append(&txn->tw.lifo_reclaimed, last); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - } + eASSERT(env, + re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); + if (MDBX_ENABLE_REFUND && re_len && + unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { + /* Refund suitable pages into "unallocated" space */ + txn_refund(txn); + re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + } + eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - /* Append PNL from GC record to tw.relist */ - ret.err = pnl_need(&txn->tw.relist, gc_len); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - txn->tw.relist = txn->tw.relist; + /* Done for a kick-reclaim mode, actually no page needed */ + if (unlikely(flags & MDBX_ALLOC_SLOT)) { + eASSERT(env, ret.err == MDBX_SUCCESS); + goto early_exit; + } - if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO - " len %zu, PNL", - last, txn->mt_dbs[FREE_DBI].md_root, gc_len); - for (size_t i = gc_len; i; i--) - DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); - DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno); - } + /* TODO: delete reclaimed records */ - /* Merge in descending sorted order */ - re_len = pnl_merge(txn->tw.relist, gc_pnl); - 
should_scan = true; - if (AUDIT_ENABLED()) { - if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - } else { - eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno)); - } - eASSERT(env, dirtylist_check(txn)); - - eASSERT(env, - re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); - if (MDBX_ENABLE_REFUND && re_len && - unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { - /* Refund suitable pages into "unallocated" space */ - if (txn_refund(txn)) - re_len = MDBX_PNL_GETSIZE(txn->tw.relist); - } - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - - /* Done for a kick-reclaim mode, actually no page needed */ - if (unlikely(flags & MDBX_ALLOC_SLOT)) { - eASSERT(env, ret.err == MDBX_SUCCESS); - goto early_exit; - } - - /* TODO: delete reclaimed records */ - - /* Don't try to coalesce too much. */ - eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); - if (flags & MDBX_ALLOC_COALESCE) { - if (re_len /* current size */ < coalesce_threshold) { + /* Don't try to coalesce too much. */ + eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); + if (flags & MDBX_ALLOC_COALESCE) { + if (re_len /* current size */ < coalesce_threshold) { #if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; + env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; #endif /* MDBX_ENABLE_PROFGC */ - goto next_gc; - } - TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - flags &= ~MDBX_ALLOC_COALESCE; - } - - scan: - eASSERT(env, should_scan); - if (re_len >= num) { - eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && - MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 
1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; - goto done; - } - } - should_scan = false; - if (ret.err == MDBX_SUCCESS) goto next_gc; - - depleted_gc: - ret.err = MDBX_NOTFOUND; - if (should_scan) - goto scan; - - //------------------------------------------------------------------------- - - /* There is no suitable pages in the GC and to be able to allocate - * we should CHOICE one of: - * - make a new steady checkpoint if reclaiming was stopped by - * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; - * - kick lagging reader(s) if reclaiming was stopped by ones of it. - * - extend the database file. */ - - /* Will use new pages from the map if nothing is suitable in the GC. */ - pgno = txn->mt_next_pgno; - const size_t newnext = num + pgno; - - const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); - /* does reclaiming stopped at the last steady point? 
*/ - if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && - detent == prefer_steady.txnid + 1) { - DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN - "-%s, detent %" PRIaTXN, - recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, - durable_caption(prefer_steady.ptr_c), detent); - const pgno_t autosync_threshold = - atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); - const uint64_t autosync_period = - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); - uint64_t eoos_timestamp; - /* wipe the last steady-point if one of: - * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified - * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted - * otherwise, make a new steady-point if one of: - * - auto-sync threshold is specified and reached; - * - upper limit of database size is reached; - * - database is full (with the current file size) - * AND auto-sync threshold it NOT specified */ - if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && - ((autosync_threshold | autosync_period) == 0 || - newnext >= prefer_steady.ptr_c->mm_geo.now)) { - /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode - * without any auto-sync threshold(s). 
*/ -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.wipes += 1; -#endif /* MDBX_ENABLE_PROFGC */ - ret.err = wipe_steady(txn, detent); - DEBUG("gc-wipe-steady, rc %d", ret.err); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - goto retry_gc_refresh_oldest; - } - if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || - (autosync_threshold && - atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= - autosync_threshold) || - (autosync_period && - (eoos_timestamp = - atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && - osal_monotime() - eoos_timestamp >= autosync_period) || - newnext >= txn->mt_geo.upper || - (newnext >= txn->mt_end_pgno && - (autosync_threshold | autosync_period) == 0)) { - /* make steady checkpoint. */ -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.flushes += 1; -#endif /* MDBX_ENABLE_PROFGC */ - MDBX_meta meta = *recent.ptr_c; - ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, - &txn->tw.troika); - DEBUG("gc-make-steady, rc %d", ret.err); - eASSERT(env, ret.err != MDBX_RESULT_TRUE); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - goto retry_gc_refresh_oldest; - } } + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); + flags &= ~MDBX_ALLOC_COALESCE; + } - if (env->me_lck_mmap.lck && - unlikely(true == - atomic_load32(&env->me_lck_mmap.lck->mti_readers_refresh_flag, - mo_AcquireRelease))) { - oldest = txn_oldest_reader(txn); - if (oldest >= detent) - goto retry_gc_have_oldest; - } - - /* avoid kick lagging reader(s) if is enough unallocated space - * at the end of database file. 
*/ - if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) { - eASSERT(env, range == nullptr); +scan: + eASSERT(env, should_scan); + if (re_len >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); + if (likely(range)) { + pgno = *range; goto done; } + } + should_scan = false; + if (ret.err == MDBX_SUCCESS) + goto next_gc; - if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { - oldest = kick_longlived_readers(env, oldest); - if (oldest >= detent) - goto retry_gc_have_oldest; +depleted_gc: + ret.err = MDBX_NOTFOUND; + if (should_scan) + goto scan; + + //------------------------------------------------------------------------- + + /* There is no suitable pages in the GC and to be able to allocate + * we should CHOICE one of: + * - make a new steady checkpoint if reclaiming was stopped by + * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; + * - kick lagging reader(s) if reclaiming was stopped by ones of it. + * - extend the database file. */ + + /* Will use new pages from the map if nothing is suitable in the GC. */ + newnext = (pgno = txn->mt_next_pgno) + num; + + /* Does reclaiming stopped at the last steady point? 
*/ + const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); + if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && + detent == prefer_steady.txnid + 1) { + DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN + "-%s, detent %" PRIaTXN, + recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, + durable_caption(prefer_steady.ptr_c), detent); + const pgno_t autosync_threshold = + atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; + /* wipe the last steady-point if one of: + * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified + * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted + * otherwise, make a new steady-point if one of: + * - auto-sync threshold is specified and reached; + * - upper limit of database size is reached; + * - database is full (with the current file size) + * AND auto-sync threshold it NOT specified */ + if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && + ((autosync_threshold | autosync_period) == 0 || + newnext >= prefer_steady.ptr_c->mm_geo.now)) { + /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode + * without any auto-sync threshold(s). 
*/ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.wipes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + ret.err = wipe_steady(txn, detent); + DEBUG("gc-wipe-steady, rc %d", ret.err); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; } + if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || + (autosync_threshold && + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + autosync_threshold) || + (autosync_period && + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period) || + newnext >= txn->mt_geo.upper || + (newnext >= txn->mt_end_pgno && + (autosync_threshold | autosync_period) == 0)) { + /* make steady checkpoint. */ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.flushes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + MDBX_meta meta = *recent.ptr_c; + ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, + &txn->tw.troika); + DEBUG("gc-make-steady, rc %d", ret.err); + eASSERT(env, ret.err != MDBX_RESULT_TRUE); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; + } + } + + if (unlikely(true == atomic_load32(&env->me_lck->mti_readers_refresh_flag, + mo_AcquireRelease))) { + oldest = txn_oldest_reader(txn); + if (oldest >= detent) + goto retry_gc_have_oldest; + } + + /* Avoid kick lagging reader(s) if is enough unallocated space + * at the end of database file. 
*/ + if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) { + eASSERT(env, range == nullptr); + goto done; + } + + if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { + oldest = kick_longlived_readers(env, oldest); + if (oldest >= detent) + goto retry_gc_have_oldest; } //--------------------------------------------------------------------------- @@ -7011,8 +7009,7 @@ no_gc: } /* Will use new pages from the map if nothing is suitable in the GC. */ - pgno = txn->mt_next_pgno; - const size_t newnext = num + pgno; + newnext = (pgno = txn->mt_next_pgno) + num; if (newnext <= txn->mt_end_pgno) goto done; @@ -7055,6 +7052,7 @@ done: eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); eASSERT(env, pgno == *range); eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); + eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); /* Cutoff allocated pages from tw.relist */ #if MDBX_PNL_ASCENDING for (const pgno_t *const end = re_list + re_len - num; range <= end; From 4a257133cb3d5b60417e79d35463fb763c2b971b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 18 Nov 2022 20:03:46 +0300 Subject: [PATCH 224/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BD=D0=B5=D1=81=D1=83=D1=89?= =?UTF-8?q?=D0=B5=D1=81=D1=82=D0=B2=D0=B5=D0=BD=D0=BD=D1=8B=D1=85=20=D0=BF?= =?UTF-8?q?=D1=80=D0=B5=D0=B4=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B9=20Coverity.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/core.c b/src/core.c index 878fe784..16a5fe6c 100644 --- a/src/core.c +++ b/src/core.c @@ -1366,7 +1366,7 @@ __cold void thread_dtor(void *rthc) { if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { TRACE("==== thread 0x%" 
PRIxPTR ", rthc %p, cleanup", osal_thread_self(), __Wpedantic_format_voidptr(reader)); - atomic_cas32(&reader->mr_pid, self_pid, 0); + (void)atomic_cas32(&reader->mr_pid, self_pid, 0); } } @@ -4469,8 +4469,9 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, unsigned npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u; size_t chunk = pgno2bytes(env, npages); eASSERT(env, bytes >= chunk); + MDBX_page *next = (MDBX_page *)((char *)wp + chunk); dpage_free(env, wp, npages); - wp = (MDBX_page *)((char *)wp + chunk); + wp = next; offset += chunk; bytes -= chunk; } while (bytes); @@ -9797,6 +9798,8 @@ retry: do { if (ctx->bigfoot > txn->mt_txnid) { rc = gcu_clean_stored_retired(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); } @@ -13632,13 +13635,13 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, } #else struct stat st; - if (stat(pathname, &st)) { + if (stat(pathname, &st) != 0) { rc = errno; if (rc != MDBX_ENOFILE) return rc; if (mode == 0 || (*flags & MDBX_RDONLY) != 0) /* can't open existing */ - return rc; + return rc /* MDBX_ENOFILE */; /* auto-create directory if requested */ const mdbx_mode_t dir_mode = @@ -19454,7 +19457,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, int rc = MDBX_SUCCESS, foliage = 0; size_t i, ptop; MDBX_env *const env = mc->mc_txn->mt_env; - MDBX_val sepkey, rkey, xdata; + MDBX_val rkey, xdata; MDBX_page *tmp_ki_copy = NULL; DKBUF; @@ -19546,6 +19549,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); cASSERT(mc, !IS_BRANCH(mp) || newindx > 0); + MDBX_val sepkey = {nullptr, 0}; /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; @@ -19878,7 +19882,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, break; } } - } else if 
(!IS_LEAF2(mp)) { + } else if (tmp_ki_copy /* !IS_LEAF2(mp) */) { /* Move nodes */ mc->mc_pg[mc->mc_top] = sister; i = split_indx; @@ -23991,7 +23995,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) { rss.rlim_cur = estimated_rss; if (rss.rlim_max < estimated_rss) - rss.rlim_max = used_range; + rss.rlim_max = estimated_rss; if (setrlimit(RLIMIT_RSS, &rss)) { rc = errno; WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS", From c46c03e7c8681d279fdb676791d44d92bca7cc68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 18 Nov 2022 20:04:09 +0300 Subject: [PATCH 225/364] mdbx: fix nasty typo/rebase/merge bug with calling `msync()` on Linux. --- src/osal.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/osal.c b/src/osal.c index 71046f6c..9bd30d04 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1706,10 +1706,15 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, return (int)GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) - assert(linux_kernel_version > 0x02061300); /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly - * tracks dirty pages and flushes them to storage as necessary. */ - return MDBX_SUCCESS; + * tracks dirty pages and flushes ones as necessary. */ + // + // However, this behavior may be changed in custom kernels, + // so just leave such optimization to the libc discretion. + // + // assert(linux_kernel_version > 0x02061300); + // if (mode_bits == MDBX_SYNC_NONE) + // return MDBX_SUCCESS; #endif /* Linux */ if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? 
MS_SYNC : MS_ASYNC)) return errno; From 543e52730dd8411928e858c5d89d4663bc40740a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 19 Nov 2022 18:17:48 +0300 Subject: [PATCH 226/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B0=20=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5=D1=80?= =?UTF-8?q?=D0=B6=D0=BA=D0=B8=20=D0=B0=D0=B2=D1=82=D0=BE-=D1=81=D0=BB?= =?UTF-8?q?=D0=B8=D1=8F=D0=BD=D0=B8=D1=8F=20=D0=B7=D0=B0=D0=BF=D0=B8=D1=81?= =?UTF-8?q?=D0=B5=D0=B9=20GC=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B8=20`pag?= =?UTF-8?q?e=5Falloc=5Fslowpath()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 110 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 38 deletions(-) diff --git a/src/core.c b/src/core.c index 16a5fe6c..625c291a 100644 --- a/src/core.c +++ b/src/core.c @@ -6595,7 +6595,8 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, #define MDBX_ALLOC_RESERVE 16 #define MDBX_ALLOC_BACKLOG 32 #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -#define MDBX_ALLOC_LIFO 128 +#define MDBX_ALLOC_SHOULD_SCAN 64 /* internal state */ +#define MDBX_ALLOC_LIFO 128 /* internal state */ static __inline bool is_gc_usable(const MDBX_txn *txn) { /* If txn is updating the GC, then the retired-list cannot play catch-up with @@ -6692,13 +6693,18 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, if (unlikely(!is_gc_usable(txn))) goto no_gc; - eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0); + eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO | + MDBX_ALLOC_SHOULD_SCAN)) == 0); flags += (env->me_flags & MDBX_LIFORECLAIM) ? 
MDBX_ALLOC_LIFO : 0; - const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; - if (txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num) - flags += MDBX_ALLOC_COALESCE; + if (/* Не коагулируем записи при подготовке резерва для обновления GC. + * Иначе попытка увеличить резерв может приводить к необходимости ещё + * большего резерва из-за увеличения списка переработанных страниц. */ + flags < MDBX_ALLOC_COALESCE) { + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + re_len < env->me_maxgc_ov1page / 2) + flags += MDBX_ALLOC_COALESCE; + } MDBX_cursor recur; ret.err = gc_cursor_init(&recur, txn); @@ -6718,7 +6724,6 @@ retry_gc_have_oldest: const txnid_t detent = oldest + 1; txnid_t id = 0; - bool should_scan = false; MDBX_cursor_op op = MDBX_FIRST; if (flags & MDBX_ALLOC_LIFO) { if (!txn->tw.lifo_reclaimed) { @@ -6790,24 +6795,54 @@ next_gc:; ret.err = MDBX_CORRUPTED; goto fail; } + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); - if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( - txn->tw.relist) >= env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ - (flags & MDBX_ALLOC_SLOT) == 0 && - /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + num) || - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid large/overflow the page list. - * This is a rare case while search for a continuously multi-page region - * in a large database. 
- * https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 - */ - NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " - "(chunk) -> %zu", - MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); - goto depleted_gc; + TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, + gc_len + re_len); + + eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); + if (unlikely(gc_len + re_len >= env->me_maxgc_ov1page)) { + /* Don't try to coalesce too much. */ + if (flags & MDBX_ALLOC_SHOULD_SCAN) { + eASSERT(env, flags & MDBX_ALLOC_COALESCE); + eASSERT(env, num > 0); +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; +#endif /* MDBX_ENABLE_PROFGC */ + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); + if (re_len >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); + if (likely(range)) { + pgno = *range; + goto done; + } + } + flags -= MDBX_ALLOC_COALESCE | MDBX_ALLOC_SHOULD_SCAN; + } + if (unlikely(/* list is too long already */ re_len >= + env->me_options.rp_augment_limit) && + ((/* not a slot-request from gc-update */ + (flags & MDBX_ALLOC_SLOT) == 0 && + /* have enough unallocated space */ txn->mt_geo.upper >= + txn->mt_next_pgno + num) || + gc_len + re_len >= MDBX_PGL_LIMIT)) { + /* Stop reclaiming to avoid large/overflow the page list. + * This is a rare case while search for a continuously multi-page region + * in a large database. 
+ * https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 + */ + NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " + "(chunk) -> %zu", + re_len, gc_len, gc_len + re_len); + goto depleted_gc; + } } /* Remember ID of readed GC record */ @@ -6834,7 +6869,7 @@ next_gc:; /* Merge in descending sorted order */ re_len = pnl_merge(txn->tw.relist, gc_pnl); - should_scan = true; + flags |= MDBX_ALLOC_SHOULD_SCAN; if (AUDIT_ENABLED()) { if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { ret.err = MDBX_CORRUPTED; @@ -6860,26 +6895,22 @@ next_gc:; /* Done for a kick-reclaim mode, actually no page needed */ if (unlikely(flags & MDBX_ALLOC_SLOT)) { eASSERT(env, ret.err == MDBX_SUCCESS); + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, + re_len); goto early_exit; } /* TODO: delete reclaimed records */ - /* Don't try to coalesce too much. */ eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); if (flags & MDBX_ALLOC_COALESCE) { - if (re_len /* current size */ < coalesce_threshold) { -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; -#endif /* MDBX_ENABLE_PROFGC */ - goto next_gc; - } - TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - flags &= ~MDBX_ALLOC_COALESCE; + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, + re_len); + goto next_gc; } scan: - eASSERT(env, should_scan); + eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); if (re_len >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); @@ -6894,13 +6925,16 @@ scan: goto done; } } - should_scan = false; - if (ret.err == MDBX_SUCCESS) + flags -= MDBX_ALLOC_SHOULD_SCAN; + if (ret.err == MDBX_SUCCESS) { + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, + re_len); goto next_gc; + } depleted_gc: ret.err = MDBX_NOTFOUND; - if (should_scan) + if (flags & MDBX_ALLOC_SHOULD_SCAN) goto scan; 
//------------------------------------------------------------------------- From 1f93dfe5fd58646654c25820d8ada29563acb3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 19 Nov 2022 23:17:53 +0300 Subject: [PATCH 227/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index 7427de95..8dd430ae 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,6 +4,26 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). +## v0.12.3 в процессе подготовки срочного исправления + +Исправления (без корректировок вышеперечисленных новых функций): + + - Устранение ошибки совершенной в коммите fe20de136c22ed3bc4c6d3f673e79c106e824f60 от 2022-09-18, + в результате чего на Linux в режиме `MDBX_WRITEMAP` никогда не вызывался `msync()`. + Проблема существует только в релизе 0.12.2. + - Добавление подсчета грязных страниц в `MDBX_WRITEMAP` для предоставления посредством `mdbx_txn_info()` + актуальной информации об объеме изменений в процессе транзакций чтения-записи. + - Исправление несущественной опечатки в условиях `#if` определения порядка байт. + +Мелочи: + + - Доработка поддержки авто-слияния записей GC внутри `page_alloc_slowpath()`. + - Устранение несущественных предупреждений Coverity. 
+ + +------------------------------------------------------------------------------- + + ## v0.12.2 (Иван Ярыгин) от 2022-11-11 Выпуск с существенными доработками и новой функциональностью From 12ed2bcfbd77598f45ebcc2eda83840a35c9df82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 21 Nov 2022 17:09:31 +0300 Subject: [PATCH 228/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=B5=D0=B4?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B3=D0=BE=20=D0=BA=D1=83=D1=80=D1=81=D0=BE?= =?UTF-8?q?=D1=80=D0=B0=20=D0=B4=D0=BB=D1=8F=20=D0=BF=D0=BE=D0=B8=D1=81?= =?UTF-8?q?=D0=BA=D0=B0=20=D0=B2=20GC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/core.c b/src/core.c index 625c291a..e7d00a15 100644 --- a/src/core.c +++ b/src/core.c @@ -6617,15 +6617,6 @@ static __inline bool is_gc_usable(const MDBX_txn *txn) { return true; } -static int gc_cursor_init(MDBX_cursor *mc, MDBX_txn *txn) { - if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { - ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", - txn->mt_dbs[FREE_DBI].md_flags); - return MDBX_CORRUPTED; - } - return cursor_init(mc, txn, FREE_DBI); -} - __hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { const size_t len = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); for (size_t i = 1; i <= len; ++i) @@ -6635,7 +6626,7 @@ __hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { } static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, - char flags) { + uint8_t flags) { #if MDBX_ENABLE_PROFGC const uint64_t monotime_before = osal_monotime(); size_t majflt_before; @@ -6706,10 +6697,10 @@ static pgr_t 
page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, flags += MDBX_ALLOC_COALESCE; } - MDBX_cursor recur; - ret.err = gc_cursor_init(&recur, txn); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; + MDBX_cursor *const gc = + (MDBX_cursor *)((char *)env->me_txn0 + sizeof(MDBX_txn)); + gc->mc_txn = txn; + gc->mc_flags = 0; retry_gc_refresh_oldest:; txnid_t oldest = txn_oldest_reader(txn); @@ -6754,7 +6745,7 @@ next_gc:; #endif /* MDBX_ENABLE_PROFGC */ /* Seek first/next GC record */ - ret.err = mdbx_cursor_get(&recur, &key, NULL, op); + ret.err = mdbx_cursor_get(gc, &key, NULL, op); if (unlikely(ret.err != MDBX_SUCCESS)) { if (unlikely(ret.err != MDBX_NOTFOUND)) goto fail; @@ -6781,10 +6772,9 @@ next_gc:; /* Reading next GC record */ MDBX_val data; - MDBX_page *const mp = recur.mc_pg[recur.mc_top]; - if (unlikely( - (ret.err = node_read(&recur, page_node(mp, recur.mc_ki[recur.mc_top]), - &data, mp)) != MDBX_SUCCESS)) + MDBX_page *const mp = gc->mc_pg[gc->mc_top]; + if (unlikely((ret.err = node_read(gc, page_node(mp, gc->mc_ki[gc->mc_top]), + &data, mp)) != MDBX_SUCCESS)) goto fail; eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); @@ -7712,7 +7702,9 @@ __cold int mdbx_env_sync_poll(MDBX_env *env) { /* Back up parent txn's cursors, then grab the originals for tracking */ static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { - for (int i = parent->mt_numdbs; --i >= 0;) { + tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); + nested->mt_cursors[FREE_DBI] = nullptr; + for (int i = parent->mt_numdbs; --i > FREE_DBI;) { nested->mt_cursors[i] = NULL; MDBX_cursor *mc = parent->mt_cursors[i]; if (mc != NULL) { @@ -7757,7 +7749,8 @@ static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { * * Returns 0 on success, non-zero on failure. 
*/ static void cursors_eot(MDBX_txn *txn, const bool merge) { - for (intptr_t i = txn->mt_numdbs; --i >= 0;) { + tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); + for (intptr_t i = txn->mt_numdbs; --i > FREE_DBI;) { MDBX_cursor *next, *mc = txn->mt_cursors[i]; if (!mc) continue; @@ -8468,6 +8461,19 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { osal_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ + } else { + if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->mt_dbs[FREE_DBI].md_flags); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + + tASSERT(txn, txn == env->me_txn0); + MDBX_cursor *const gc = (MDBX_cursor *)((char *)txn + sizeof(MDBX_txn)); + rc = cursor_init(gc, txn, FREE_DBI); + if (rc != MDBX_SUCCESS) + goto bailout; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, txn); @@ -9462,7 +9468,7 @@ static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { #if MDBX_ENABLE_BIGFOOT ctx->bigfoot = txn->mt_txnid; #endif /* MDBX_ENABLE_BIGFOOT */ - return gc_cursor_init(&ctx->cursor, txn); + return cursor_init(&ctx->cursor, txn, FREE_DBI); } static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { @@ -9543,7 +9549,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, err = gcu_clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; - err = page_alloc_slowpath(&ctx->cursor, (pgno_t)pages4retiredlist, + err = page_alloc_slowpath(&ctx->cursor, pages4retiredlist, MDBX_ALLOC_GC | MDBX_ALLOC_RESERVE) .err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); @@ -14180,7 +14186,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } if ((flags & MDBX_RDONLY) == 0) { - const size_t tsize = sizeof(MDBX_txn), + const size_t tsize = sizeof(MDBX_txn) + sizeof(MDBX_cursor), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + 
sizeof(MDBX_cursor *) + sizeof(MDBX_atomic_uint32_t) + 1); From 141cce0c0f862742b38ad3033dc0579cde366d50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 22 Nov 2022 10:12:40 +0300 Subject: [PATCH 229/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20`size=5Ft`?= =?UTF-8?q?=20=D0=B4=D0=BB=D1=8F=20`npages`=20(=D0=BA=D0=BE=D1=81=D0=BC?= =?UTF-8?q?=D0=B5=D1=82=D0=B8=D0=BA=D0=B0).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 66 +++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/core.c b/src/core.c index e7d00a15..739a62a0 100644 --- a/src/core.c +++ b/src/core.c @@ -2459,7 +2459,7 @@ __hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { return total; } -static void spill_remove(MDBX_txn *txn, size_t idx, pgno_t npages) { +static void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) { tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && txn->tw.spilled.least_removed > 0); txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) @@ -2569,7 +2569,7 @@ static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { } static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, - pgno_t npages) { + size_t npages) { const MDBX_PNL pnl = txn->tw.spilled.list; if (likely(!pnl)) return false; @@ -2582,7 +2582,7 @@ static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, DEBUG_EXTRA_PRINT("%s\n", "]"); } const pgno_t spilled_range_begin = pgno << 1; - const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1; + const pgno_t spilled_range_last = ((pgno + (pgno_t)npages) << 1) - 1; #if MDBX_PNL_ASCENDING const size_t n = pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) 
<< 1); @@ -2946,7 +2946,7 @@ dpl_endpgno(const MDBX_dpl *dl, size_t i) { } static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, - pgno_t npages) { + size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); @@ -3004,7 +3004,7 @@ MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, return nullptr; } -static void dpl_remove_ex(const MDBX_txn *txn, size_t i, pgno_t npages) { +static void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); @@ -3026,7 +3026,7 @@ static void dpl_remove(const MDBX_txn *txn, size_t i) { static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, - pgno_t npages) { + size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *dl = txn->tw.dirtylist; @@ -3095,7 +3095,7 @@ static __must_check_result __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp); static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - pgno_t npages); + size_t npages); typedef struct page_result { MDBX_page *page; int err; @@ -3104,7 +3104,7 @@ typedef struct page_result { static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); -static pgr_t page_new_large(MDBX_cursor *mc, const pgno_t npages); +static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages); static int page_touch(MDBX_cursor *mc); static int cursor_touch(MDBX_cursor *mc); static int touch_dbi(MDBX_cursor *mc); @@ -3703,7 +3703,7 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { } /* Free a shadow dirty page */ -static void dpage_free(MDBX_env *env, MDBX_page *dp, pgno_t npages) { +static void 
dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) @@ -4040,9 +4040,9 @@ static __inline bool txn_refund(MDBX_txn *txn) { #endif /* MDBX_ENABLE_REFUND */ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, - pgno_t npages) { + size_t npages) { MDBX_env *const env = txn->mt_env; - DEBUG("kill %u page(s) %" PRIaPGNO, npages, pgno); + DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno); eASSERT(env, pgno >= NUM_METAS && npages); if (!IS_FROZEN(txn, mp)) { const size_t bytes = pgno2bytes(env, npages); @@ -4069,7 +4069,7 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, /* Remove page from dirty list */ static __inline void page_wash(MDBX_txn *txn, const size_t di, - MDBX_page *const mp, const pgno_t npages) { + MDBX_page *const mp, const size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, di && di <= txn->tw.dirtylist->length && @@ -4118,7 +4118,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, * So for flexibility and avoid extra internal dependencies we just * fallback to reading if dirty list was not allocated yet. 
*/ size_t di = 0, si = 0; - pgno_t npages = 1; + size_t npages = 1; bool is_frozen = false, is_spilled = false, is_shadowed = false; if (unlikely(!mp)) { if (ASSERT_ENABLED() && pageflags) { @@ -4213,12 +4213,12 @@ status_done: } else { npages = mp->mp_pages; cASSERT(mc, mc->mc_db->md_overflow_pages >= npages); - mc->mc_db->md_overflow_pages -= npages; + mc->mc_db->md_overflow_pages -= (pgno_t)npages; } if (is_frozen) { retire: - DEBUG("retire %u page %" PRIaPGNO, npages, pgno); + DEBUG("retire %zu page %" PRIaPGNO, npages, pgno); rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); tASSERT(txn, dirtylist_check(txn)); return rc; @@ -4269,7 +4269,7 @@ status_done: } tASSERT(txn, is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); } - DEBUG("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); + DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno); txn->mt_next_pgno = pgno; txn_refund(txn); return MDBX_SUCCESS; @@ -4338,7 +4338,7 @@ status_done: page_wash(txn, di, mp, npages); reclaim: - DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); + DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno); rc = pnl_insert_range(&txn->tw.relist, pgno, npages); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -4466,7 +4466,7 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, do { eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); - unsigned npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u; + size_t npages = IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u; size_t chunk = pgno2bytes(env, npages); eASSERT(env, bytes >= chunk); MDBX_page *next = (MDBX_page *)((char *)wp + chunk); @@ -4500,7 +4500,7 @@ __must_check_result static int iov_write(iov_ctx_t *ctx) { } __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, - MDBX_page *dp, pgno_t npages) { + MDBX_page *dp, size_t npages) { MDBX_env *const env = txn->mt_env; tASSERT(txn, ctx->err == MDBX_SUCCESS); tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); @@ -4544,16 +4544,16 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, #if MDBX_NEED_WRITTEN_RANGE ctx->flush_begin = (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; - ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) + ctx->flush_end = (ctx->flush_end > dp->mp_pgno + (pgno_t)npages) ? ctx->flush_end - : dp->mp_pgno + npages; + : dp->mp_pgno + (pgno_t)npages; #endif /* MDBX_NEED_WRITTEN_RANGE */ env->me_lck->mti_unsynced_pages.weak += npages; return MDBX_SUCCESS; } static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, - const pgno_t npages) { + const size_t npages) { tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP) || MDBX_AVOID_MSYNC); #if MDBX_ENABLE_PGOP_STAT txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; @@ -4612,16 +4612,16 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; const uint32_t age = dpl_age(txn, i); - const unsigned npages = dpl_npages(dl, i); + const size_t npages = dpl_npages(dl, i); const pgno_t pgno = dl->items[i].pgno; if (age == 0) { - DEBUG("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); + DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno); return 256; } MDBX_page *const dp = dl->items[i].ptr; if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { - DEBUG("skip %s %u page %" PRIaPGNO, + DEBUG("skip %s %zu page %" PRIaPGNO, (dp->mp_flags & P_LOOSE) ? 
"loose" : (dp->mp_flags & P_LOOSE) ? "loose" : "parent-spilled", @@ -4635,7 +4635,7 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i, if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { do if (intersect_spilled(parent, pgno, npages)) { - DEBUG("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); + DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno); dp->mp_flags |= P_SPILLED; return 256; } @@ -4649,7 +4649,7 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i, return prio = 256 - prio; /* make a large/overflow pages be likely to spill */ - uint32_t factor = npages | npages >> 1; + size_t factor = npages | npages >> 1; factor |= factor >> 2; factor |= factor >> 4; factor |= factor >> 8; @@ -4657,7 +4657,7 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i, factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; factor = (factor < 256) ? 255 - factor : 0; tASSERT(txn, factor < 256 && factor < (256 - prio)); - return prio = factor; + return prio = (unsigned)factor; } /* Spill pages from the dirty list back to disk. @@ -5484,7 +5484,7 @@ __cold static pgno_t find_largest_snapshot(const MDBX_env *env, /* Add a page to the txn's dirty list */ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - pgno_t npages) { + size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); mp->mp_txnid = txn->mt_front; if (!txn->tw.dirtylist) { @@ -17165,14 +17165,14 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { return ret; } -static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { +static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { pgr_t ret = likely(npages == 1) ? 
page_alloc(mc) : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, + DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %zu", mc->mc_dbi, ret.page->mp_pgno, npages); ret.page->mp_flags = P_OVERFLOW; cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); @@ -17181,8 +17181,8 @@ static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - mc->mc_db->md_overflow_pages += npages; - ret.page->mp_pages = npages; + mc->mc_db->md_overflow_pages += (pgno_t)npages; + ret.page->mp_pages = (pgno_t)npages; cASSERT(mc, !(mc->mc_flags & C_SUB)); return ret; } From da023657f5b9454f955506eba747896998b8bf5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 22 Nov 2022 01:11:46 +0300 Subject: [PATCH 230/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D1=80?= =?UTF-8?q?=D0=B0=D0=B1=D0=BE=D1=82=D0=BA=D0=B0=20=D0=B2=D0=BD=D1=83=D1=82?= =?UTF-8?q?=D1=80=D0=B5=D0=BD=D0=BD=D0=B8=D1=85=20=D1=84=D0=BB=D0=B0=D0=B3?= =?UTF-8?q?=D0=BE=D0=B2=20=D1=81=D0=B2=D1=8F=D0=B7=D0=B0=D0=BD=D0=BD=D1=8B?= =?UTF-8?q?=D1=85=20=D1=81=20=D0=B2=D1=8B=D0=B4=D0=B5=D0=BB=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=D0=BC=20=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86?= =?UTF-8?q?=20=D0=B8=D0=B7=20GC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bits.md | 4 +- src/core.c | 247 +++++++++++++++++++++--------------------------- src/internals.h | 9 +- 3 files changed, 116 insertions(+), 144 deletions(-) diff --git a/src/bits.md b/src/bits.md index a9b7c2b1..82c9eed4 100644 --- a/src/bits.md +++ b/src/bits.md @@ -5,8 +5,8 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD 2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH 
|F_DUPDATA|P_OVERFLOW| | 3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | 4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | -5 |0000 0020| |TXN_UPDATE_GC |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | -6 |0000 0040| |TXN_FROZEN_RE |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | +5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | +6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | 7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | | | | 8 |0000 0100| _MAY_MOVE | | | | | | | <= | 9 |0000 0200| _MAY_UNMAP| | | | | | | <= | diff --git a/src/core.c b/src/core.c index 739a62a0..b9240eaf 100644 --- a/src/core.c +++ b/src/core.c @@ -6588,20 +6588,19 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, * * Returns 0 on success, non-zero on failure.*/ -#define MDBX_ALLOC_GC 1 -#define MDBX_ALLOC_NEW 2 -#define MDBX_ALLOC_COALESCE 4 -#define MDBX_ALLOC_SLOT 8 -#define MDBX_ALLOC_RESERVE 16 -#define MDBX_ALLOC_BACKLOG 32 -#define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -#define MDBX_ALLOC_SHOULD_SCAN 64 /* internal state */ -#define MDBX_ALLOC_LIFO 128 /* internal state */ +#define MDBX_ALLOC_DEFAULT 0 +#define MDBX_ALLOC_RESERVE 1 +#define MDBX_ALLOC_UNIMPORTANT 2 +#define MDBX_ALLOC_COALESCE 4 /* внутреннее состояние */ +#define MDBX_ALLOC_SHOULD_SCAN 8 /* внутреннее состояние */ +#define MDBX_ALLOC_LIFO 16 /* внутреннее состояние */ -static __inline bool is_gc_usable(const MDBX_txn *txn) { +static __inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, + const uint8_t flags) { /* If txn is updating the GC, then the retired-list cannot play catch-up with * itself by growing while trying to save it. 
*/ - if (txn->mt_flags & (MDBX_TXN_UPDATE_GC | MDBX_TXN_FROZEN_RE)) + if (mc->mc_dbi == FREE_DBI && !(flags & MDBX_ALLOC_RESERVE) && + !(mc->mc_flags & C_GCU)) return false; /* avoid (recursive) search inside empty tree and while tree is @@ -6609,11 +6608,6 @@ static __inline bool is_gc_usable(const MDBX_txn *txn) { if (txn->mt_dbs[FREE_DBI].md_entries == 0) return false; - /* If our dirty list is already full, we can't touch GC */ - if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && - !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) - return false; - return true; } @@ -6644,22 +6638,13 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, prof->spe_counter += 1; #endif /* MDBX_ENABLE_PROFGC */ - eASSERT(env, flags & MDBX_ALLOC_GC); - eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); - eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); - eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | - MDBX_ALLOC_BACKLOG)) == 0 || - (flags & MDBX_ALLOC_NEW) == 0); + eASSERT(env, num > 0 || (flags & MDBX_ALLOC_RESERVE)); eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno = 0, *range = nullptr; size_t newnext, re_len = MDBX_PNL_GETSIZE(txn->tw.relist); if (num > 1) { - eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); - eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); - if (unlikely(txn->mt_flags & MDBX_TXN_FROZEN_RE)) - goto no_gc; #if MDBX_ENABLE_PROFGC prof->xpages += 1; #endif /* MDBX_ENABLE_PROFGC */ @@ -6675,13 +6660,12 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, } } } else { - eASSERT(env, - (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || re_len == 0); + eASSERT(env, num == 0 || re_len == 0); } //--------------------------------------------------------------------------- - if (unlikely(!is_gc_usable(txn))) + if (unlikely(!is_gc_usable(txn, mc, flags))) goto no_gc; eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO | @@ -6691,7 
+6675,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, if (/* Не коагулируем записи при подготовке резерва для обновления GC. * Иначе попытка увеличить резерв может приводить к необходимости ещё * большего резерва из-за увеличения списка переработанных страниц. */ - flags < MDBX_ALLOC_COALESCE) { + (flags & MDBX_ALLOC_RESERVE) == 0) { if (txn->mt_dbs[FREE_DBI].md_branch_pages && re_len < env->me_maxgc_ov1page / 2) flags += MDBX_ALLOC_COALESCE; @@ -6777,7 +6761,6 @@ next_gc:; &data, mp)) != MDBX_SUCCESS)) goto fail; - eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); pgno_t *gc_pnl = (pgno_t *)data.iov_base; if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || @@ -6818,8 +6801,7 @@ next_gc:; } if (unlikely(/* list is too long already */ re_len >= env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ - (flags & MDBX_ALLOC_SLOT) == 0 && + ((/* not a slot-request from gc-update */ num && /* have enough unallocated space */ txn->mt_geo.upper >= txn->mt_next_pgno + num) || gc_len + re_len >= MDBX_PGL_LIMIT)) { @@ -6883,7 +6865,7 @@ next_gc:; txn->mt_next_pgno - MDBX_ENABLE_REFUND)); /* Done for a kick-reclaim mode, actually no page needed */ - if (unlikely(flags & MDBX_ALLOC_SLOT)) { + if (unlikely(num == 0)) { eASSERT(env, ret.err == MDBX_SUCCESS); TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, re_len); @@ -6901,6 +6883,7 @@ next_gc:; scan: eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); + eASSERT(env, num > 0); if (re_len >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); @@ -6977,8 +6960,7 @@ depleted_gc: meta_prefer_steady(env, &txn->tw.troika).ptr_c); goto retry_gc_refresh_oldest; } - if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || - (autosync_threshold && + if ((autosync_threshold && atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= 
autosync_threshold) || (autosync_period && @@ -6986,7 +6968,7 @@ depleted_gc: atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && osal_monotime() - eoos_timestamp >= autosync_period) || newnext >= txn->mt_geo.upper || - (newnext >= txn->mt_end_pgno && + ((num == 0 || newnext >= txn->mt_end_pgno) && (autosync_threshold | autosync_period) == 0)) { /* make steady checkpoint. */ #if MDBX_ENABLE_PROFGC @@ -7014,7 +6996,7 @@ depleted_gc: /* Avoid kick lagging reader(s) if is enough unallocated space * at the end of database file. */ - if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) { + if (!(flags & MDBX_ALLOC_RESERVE) && newnext <= txn->mt_end_pgno) { eASSERT(env, range == nullptr); goto done; } @@ -7028,7 +7010,7 @@ depleted_gc: //--------------------------------------------------------------------------- no_gc: - if ((flags & MDBX_ALLOC_NEW) == 0) { + if (flags & MDBX_ALLOC_RESERVE) { ret.err = MDBX_NOTFOUND; goto fail; } @@ -7071,10 +7053,9 @@ no_gc: done: ret.err = MDBX_SUCCESS; - if (likely((flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) == 0)) { + if (likely((flags & MDBX_ALLOC_RESERVE) == 0)) { ENSURE(env, pgno >= NUM_METAS); if (range) { - eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); eASSERT(env, pgno == *range); eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); @@ -7091,7 +7072,6 @@ done: eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { - eASSERT(env, flags & MDBX_ALLOC_NEW); pgno = txn->mt_next_pgno; txn->mt_next_pgno += (pgno_t)num; eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); @@ -7135,8 +7115,9 @@ done: int level; const char *what; if (flags & MDBX_ALLOC_RESERVE) { - level = (flags & MDBX_ALLOC_BACKLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; - what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; + level = + (flags & MDBX_ALLOC_UNIMPORTANT) ? 
MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; + what = num ? "reserve-pages" : "fetch-slot"; } else { txn->mt_flags |= MDBX_TXN_ERROR; level = MDBX_LOG_ERROR; @@ -7151,7 +7132,7 @@ done: } else { early_exit: DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, - (flags & MDBX_ALLOC_SLOT) ? "SLOT" : "RESERVE", ret.err); + num ? "RESERVE" : "SLOT", ret.err); ret.page = NULL; } @@ -7197,62 +7178,60 @@ __hot static pgr_t page_alloc(const MDBX_cursor *mc) { return ret; } - if (likely(!(txn->mt_flags & MDBX_TXN_FROZEN_RE))) { - MDBX_PNL pnl = txn->tw.relist; - const size_t len = MDBX_PNL_GETSIZE(pnl); - if (likely(len > 0)) { - MDBX_env *const env = txn->mt_env; + MDBX_PNL pnl = txn->tw.relist; + const size_t len = MDBX_PNL_GETSIZE(pnl); + if (likely(len > 0)) { + MDBX_env *const env = txn->mt_env; - MDBX_PNL_SETSIZE(pnl, len - 1); + MDBX_PNL_SETSIZE(pnl, len - 1); #if MDBX_PNL_ASCENDING - const pgno_t pgno = pnl[1]; - for (size_t i = 1; i < len; ++i) - pnl[i] = pnl[i + 1]; + const pgno_t pgno = pnl[1]; + for (size_t i = 1; i < len; ++i) + pnl[i] = pnl[i + 1]; #else - const pgno_t pgno = pnl[len]; + const pgno_t pgno = pnl[len]; #endif #if MDBX_ENABLE_PROFGC - const uint64_t monotime_before = osal_monotime(); - size_t majflt_before; - const uint64_t cputime_before = osal_cputime(&majflt_before); - profgc_stat_t *const prof = - (mc->mc_dbi == FREE_DBI) ? &env->me_lck->mti_pgop_stat.gc_prof.self - : &env->me_lck->mti_pgop_stat.gc_prof.work; + const uint64_t monotime_before = osal_monotime(); + size_t majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? 
&env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; #endif /* MDBX_ENABLE_PROFGC */ - pgr_t ret; - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); - } else { - ret.page = page_malloc(txn, 1); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto bailout; - } + pgr_t ret; + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); + } else { + ret.page = page_malloc(txn, 1); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + goto bailout; } - - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize); - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); - - ret.err = page_dirty(txn, ret.page, 1); - bailout: - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); -#if MDBX_ENABLE_PROFGC - size_t majflt_after; - prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; - prof->majflt += majflt_after - majflt_before; - prof->xtime_monotonic += osal_monotime() - monotime_before; -#endif /* MDBX_ENABLE_PROFGC */ - return ret; } + + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize); + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); + + ret.err = page_dirty(txn, ret.page, 1); + bailout: + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; + prof->xtime_monotonic += osal_monotime() - monotime_before; +#endif /* MDBX_ENABLE_PROFGC */ + return ret; } - return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); + return page_alloc_slowpath(mc, 1, MDBX_ALLOC_DEFAULT); } /* 
Copy the used portions of a page. */ @@ -9503,6 +9482,13 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { return err; } +static int gcu_touch(gcu_context_t *ctx) { + ctx->cursor.mc_flags |= C_GCU; + int err = cursor_touch(&ctx->cursor); + ctx->cursor.mc_flags -= C_GCU; + return err; +} + /* Prepare a backlog of pages to modify GC itself, while reclaiming is * prohibited. It should be enough to prevent search in page_alloc_slowpath() * during a deleting, when GC tree is unbalanced. */ @@ -9537,9 +9523,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, return err; } - tASSERT(txn, txn->mt_flags & MDBX_TXN_UPDATE_GC); - txn->mt_flags -= MDBX_TXN_UPDATE_GC; - err = cursor_touch(&ctx->cursor); + err = gcu_touch(ctx); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); if (unlikely(pages4retiredlist > 1) && @@ -9549,9 +9533,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, err = gcu_clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; - err = page_alloc_slowpath(&ctx->cursor, pages4retiredlist, - MDBX_ALLOC_GC | MDBX_ALLOC_RESERVE) - .err; + err = + page_alloc_slowpath(&ctx->cursor, pages4retiredlist, MDBX_ALLOC_RESERVE) + .err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); cASSERT(&ctx->cursor, gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); @@ -9560,11 +9544,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && err == MDBX_SUCCESS) err = page_alloc_slowpath(&ctx->cursor, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG) + MDBX_ALLOC_RESERVE | MDBX_ALLOC_UNIMPORTANT) .err; - txn->mt_flags += MDBX_TXN_UPDATE_GC; TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? 
err : MDBX_SUCCESS; } @@ -9593,7 +9575,6 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { MDBX_env *const env = txn->mt_env; const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; (void)dbg_prefix_mode; - txn->mt_flags += MDBX_TXN_UPDATE_GC; ctx->cursor.mc_next = txn->mt_cursors[FREE_DBI]; txn->mt_cursors[FREE_DBI] = &ctx->cursor; @@ -9741,10 +9722,7 @@ retry: if (txn->tw.loose_count > 0) { TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, txn->tw.loose_count); - rc = page_alloc_slowpath(&ctx->cursor, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_RESERVE) - .err; + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; if (rc == MDBX_SUCCESS) { TRACE("%s: retry since gc-slot for %zu loose-pages available", dbg_prefix_mode, txn->tw.loose_count); @@ -9826,9 +9804,9 @@ retry: if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ - txn->mt_flags -= MDBX_TXN_UPDATE_GC; - rc = page_search(&ctx->cursor, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); - txn->mt_flags += MDBX_TXN_UPDATE_GC; + rc = cursor_last(&ctx->cursor, nullptr, nullptr); + if (likely(rc != MDBX_SUCCESS)) + rc = gcu_touch(ctx); if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } @@ -9966,16 +9944,12 @@ retry: left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page && !ctx->dense) { - /* Hужен свобожный для для сохранения списка страниц. */ + /* Hужен свободный для для сохранения списка страниц. 
*/ bool need_cleanup = false; txnid_t snap_oldest = 0; retry_rid: - txn->mt_flags -= MDBX_TXN_UPDATE_GC; do { - rc = page_alloc_slowpath(&ctx->cursor, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | - MDBX_ALLOC_RESERVE) - .err; + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; snap_oldest = env->me_lck->mti_oldest_reader.weak; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, @@ -9988,7 +9962,6 @@ retry: left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page); - txn->mt_flags += MDBX_TXN_UPDATE_GC; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: got enough from GC.", dbg_prefix_mode); @@ -10006,7 +9979,7 @@ retry: } else { tASSERT(txn, txn->tw.last_reclaimed == 0); if (unlikely(txn_oldest_reader(txn) != snap_oldest)) - /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) + /* should retry page_alloc_slowpath() * if the oldest reader changes since the last attempt */ goto retry_rid; /* no reclaimable GC entries, @@ -10289,7 +10262,6 @@ retry: key.iov_len = sizeof(fill_gc_id); tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); - txn->mt_flags += MDBX_TXN_FROZEN_RE; size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, @@ -10297,14 +10269,11 @@ retry: if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); - if (ctx->loop < 7) - txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; } chunk = left; } rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_CURRENT | MDBX_RESERVE); - txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; gcu_clean_reserved(env, data); @@ -15079,7 +15048,8 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, if (!MDBX_DISABLE_VALIDATION) { const MDBX_env *env = mc->mc_txn->mt_env; const size_t dsize = data->iov_len; - if 
(unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax)) + if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax) && + mc->mc_dbi != FREE_DBI) poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); const unsigned npages = number_of_ovpages(env, dsize); if (unlikely(lp.page->mp_pages != npages)) { @@ -15087,7 +15057,7 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, return bad_page(lp.page, "too less n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); - else + else if (mc->mc_dbi != FREE_DBI) poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); } @@ -16183,7 +16153,6 @@ static int touch_dbi(MDBX_cursor *mc) { *mc->mc_dbistate |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { - cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_UPDATE_GC) == 0); /* Touch DB record of named DB */ MDBX_cursor_couple cx; int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); @@ -16596,9 +16565,9 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* Large/Overflow page overwrites need special handling */ if (unlikely(node_flags(node) & F_BIGDATA)) { - int dpages = (node_size(key, data) > env->me_leaf_nodemax) - ? number_of_ovpages(env, data->iov_len) - : 0; + const size_t dpages = (node_size(key, data) > env->me_leaf_nodemax) + ? number_of_ovpages(env, data->iov_len) + : 0; const pgno_t pgno = node_largedata_pgno(node); pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); @@ -16607,13 +16576,13 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); /* Is the ov page from this txn (or a parent) and big enough? */ - int ovpages = lp.page->mp_pages; - if (!IS_FROZEN(mc->mc_txn, lp.page) && - (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_FROZEN_RE) - ? 
(ovpages >= dpages) - : (ovpages == - /* LY: add configurable threshold to keep reserve space */ - dpages))) { + const size_t ovpages = lp.page->mp_pages; + const size_t extra_threshold = + (mc->mc_dbi == FREE_DBI) + ? 1 + : /* LY: add configurable threshold to keep reserve space */ 0; + if (!IS_FROZEN(mc->mc_txn, lp.page) && ovpages >= dpages && + ovpages <= dpages + extra_threshold) { /* yes, overwrite it. */ if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) { if (IS_SPILLED(mc->mc_txn, lp.page)) { @@ -17168,7 +17137,7 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { pgr_t ret = likely(npages == 1) ? page_alloc(mc) - : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); + : page_alloc_slowpath(mc, npages, MDBX_ALLOC_DEFAULT); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; @@ -17279,7 +17248,6 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, key ? key->iov_len : 0, DKEY_DEBUG(key)); cASSERT(mc, key != NULL && data != NULL); cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF); - cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); MDBX_page *largepage = NULL; size_t node_bytes; @@ -17288,6 +17256,7 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, page_room(mp) >= node_bytes); } else if (unlikely(node_size(key, data) > mc->mc_txn->mt_env->me_leaf_nodemax)) { /* Put data on large/overflow page. 
*/ @@ -17301,6 +17270,7 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, flags); return MDBX_PROBLEM; } + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); const pgr_t npr = page_new_large(mc, ovpages); if (unlikely(npr.err != MDBX_SUCCESS)) @@ -17312,10 +17282,12 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, flags |= F_BIGDATA; node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); } else { + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); node_bytes = node_size(key, data) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); } - cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); /* Move higher pointers up one slot. */ const size_t nkeys = page_numkeys(mp); @@ -19056,7 +19028,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); if (unlikely(node_size_len(node_ks(node), dsize) <= - mc->mc_txn->mt_env->me_leaf_nodemax)) + mc->mc_txn->mt_env->me_leaf_nodemax) && + mc->mc_dbi != FREE_DBI) poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); if ((mc->mc_checking & CC_RETIRING) == 0) { @@ -19071,7 +19044,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { rc = bad_page(lp.page, "too less n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); - else + else if (mc->mc_dbi != FREE_DBI) poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", lp.page->mp_pages, dsize); diff --git a/src/internals.h b/src/internals.h index 1c0c6f98..e6bcdd9a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1000,13 +1000,9 
@@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */ -#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */ - #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ - MDBX_TXN_FROZEN_RE) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -1147,6 +1143,9 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ uint8_t mc_flags; /* Cursor checking flags. */ From a1333fc827bba3fb0a2c5d9c718177d27cc2cf5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 23 Nov 2022 00:57:02 +0300 Subject: [PATCH 231/364] mdbx: fix SIGSEGV/invalid-deref/invalid-free inside `env_close()` when `mdbx_env_open()` failed in re-open case. Thanks to [@leisim](https://t.me/leisim) for [reporting](https://t.me/libmdbx/3946) this issue. 
--- src/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index b9240eaf..2342a970 100644 --- a/src/core.c +++ b/src/core.c @@ -14280,9 +14280,10 @@ __cold static int env_close(MDBX_env *env) { } if (env->me_dbxs) { - for (size_t i = env->me_numdbs; --i >= CORE_DBS;) + for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) osal_free(env->me_dbxs[i].md_name.iov_base); osal_free(env->me_dbxs); + env->me_numdbs = CORE_DBS; env->me_dbxs = nullptr; } if (env->me_pbuf) { From 61eafe80c14d18cf3a10d67237c3a6dbc60e4d6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 23 Nov 2022 01:18:25 +0300 Subject: [PATCH 232/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20https://libm?= =?UTF-8?q?dbx.dqdkfa.ru/dead-github=20=D0=B4=D0=BB=D1=8F=20=D1=83=D0=B4?= =?UTF-8?q?=D0=B0=D0=BB=D0=B5=D0=BD=D0=BD=D1=8B=D1=85=20issues.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 128 ++++++++++++++++++------------------- README.md | 4 +- TODO.md | 22 +++---- mdbx.h++ | 2 +- src/core.c | 21 +++--- src/mdbx_chk.c | 2 +- src/osal.c | 12 ++-- test/valgrind_suppress.txt | 2 +- 8 files changed, 96 insertions(+), 97 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 8dd430ae..16e6b1a4 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -491,7 +491,7 @@ Minors: The stable release with the complete workaround for an incoherence flaw of Linux unified page/buffer cache. Nonetheless the cause for this trouble may be an issue of Intel CPU cache/MESI. -See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information. +See [issue#269](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for more information. 
Acknowledgements: @@ -500,8 +500,8 @@ Acknowledgements: Fixes: - - [Added complete workaround](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for an incoherence flaw of Linux unified page/buffer cache. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/272) cursor reusing for read-only transactions. + - [Added complete workaround](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for an incoherence flaw of Linux unified page/buffer cache. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/272) cursor reusing for read-only transactions. - Fixed copy&paste typo inside `mdbx::cursor::find_multivalue()`. Minors: @@ -516,7 +516,7 @@ Minors: ## v0.11.5 at 2022-02-23 The release with the temporary hotfix for a flaw of Linux unified page/buffer cache. -See [issue#269](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for more information. +See [issue#269](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for more information. Acknowledgements: @@ -526,10 +526,10 @@ Acknowledgements: Fixes: - - [Added hotfix](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269) for a flaw of Linux unified page/buffer cache. - - [Fixed/Reworked](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/270) move-assignment operators for "managed" classes of C++ API. + - [Added hotfix](https://libmdbx.dqdkfa.ru/dead-github/issues/269) for a flaw of Linux unified page/buffer cache. + - [Fixed/Reworked](https://libmdbx.dqdkfa.ru/dead-github/pull/270) move-assignment operators for "managed" classes of C++ API. - Fixed potential `SIGSEGV` while open DB with overrided non-default page size. - - [Made](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/267) `mdbx_env_open()` idempotence in failure cases. + - [Made](https://libmdbx.dqdkfa.ru/dead-github/issues/267) `mdbx_env_open()` idempotence in failure cases. 
- Refined/Fixed pages reservation inside `mdbx_update_gc()` to avoid non-reclamation in a rare cases. - Fixed typo in a retained space calculation for the hsr-callback. @@ -562,15 +562,15 @@ New features, extensions and improvements: Fixes: - Fixed handling `MDBX_opt_rp_augment_limit` for GC's records from huge transactions (Erigon/Akula/Ethereum). - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/258) build on Android (avoid including `sys/sem.h`). - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/261) missing copy assignment operator for `mdbx::move_result`. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/258) build on Android (avoid including `sys/sem.h`). + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/pull/261) missing copy assignment operator for `mdbx::move_result`. - Fixed missing `&` for `std::ostream &operator<<()` overloads. - Fixed unexpected `EXDEV` (Cross-device link) error from `mdbx_env_copy()`. - Fixed base64 encoding/decoding bugs in auxillary C++ API. - Fixed overflow of `pgno_t` during checking PNL on 64-bit platforms. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/260) excessive PNL checking after sort for spilling. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/260) excessive PNL checking after sort for spilling. - Reworked checking `MAX_PAGENO` and DB upper-size geometry limit. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/265) build for some combinations of versions of MSVC and Windows SDK. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/265) build for some combinations of versions of MSVC and Windows SDK. Minors: @@ -597,10 +597,10 @@ Acknowledgements: New features, extensions and improvements: - - [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/236) `mdbx_cursor_get_batch()`. 
- - [Added](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/250) `MDBX_SET_UPPERBOUND`. + - [Added](https://libmdbx.dqdkfa.ru/dead-github/issues/236) `mdbx_cursor_get_batch()`. + - [Added](https://libmdbx.dqdkfa.ru/dead-github/issues/250) `MDBX_SET_UPPERBOUND`. - C++ API is finalized now. - - The GC update stage has been [significantly speeded](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem). + - The GC update stage has been [significantly speeded](https://libmdbx.dqdkfa.ru/dead-github/issues/254) when fixing huge Erigon's transactions (Ethereum ecosystem). Fixes: @@ -611,12 +611,12 @@ Minors: - Fixed returning `MDBX_RESULT_TRUE` (unexpected -1) from `mdbx_env_set_option()`. - Added `mdbx_env_get_syncbytes()` and `mdbx_env_get_syncperiod()`. - - [Clarified](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/249) description of `MDBX_INTEGERKEY`. + - [Clarified](https://libmdbx.dqdkfa.ru/dead-github/pull/249) description of `MDBX_INTEGERKEY`. - Reworked/simplified `mdbx_env_sync_internal()`. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/248) extra assertion inside `mdbx_cursor_put()` for `MDBX_DUPFIXED` cases. - Avoiding extra looping inside `mdbx_env_info_ex()`. - Explicitly enabled core dumps from stochastic tests scripts on Linux. - - [Fixed](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/253) `mdbx_override_meta()` to avoid false-positive assertions. + - [Fixed](https://libmdbx.dqdkfa.ru/dead-github/issues/253) `mdbx_override_meta()` to avoid false-positive assertions. - For compatibility reverted returning `MDBX_ENODATA`for some cases. 
@@ -632,10 +632,10 @@ Acknowledgements: Fixes: - - [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/239) with `devtoolset-9` on CentOS/RHEL 7. - - [Fixed unexpected `MDBX_PROBLEM` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/242) because of update an obsolete meta-page. - - [Fixed returning `MDBX_NOTFOUND` error](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation. - - [Fixed compilation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/245) without kernel/libc-devel headers. + - [Fixed compilation](https://libmdbx.dqdkfa.ru/dead-github/pull/239) with `devtoolset-9` on CentOS/RHEL 7. + - [Fixed unexpected `MDBX_PROBLEM` error](https://libmdbx.dqdkfa.ru/dead-github/issues/242) because of update an obsolete meta-page. + - [Fixed returning `MDBX_NOTFOUND` error](https://libmdbx.dqdkfa.ru/dead-github/issues/243) in case an inexact value found for `MDBX_GET_BOTH` operation. + - [Fixed compilation](https://libmdbx.dqdkfa.ru/dead-github/issues/245) without kernel/libc-devel headers. Minors: @@ -652,7 +652,7 @@ Minors: The database format signature has been changed to prevent forward-interoperability with an previous releases, which may lead to a -[false positive diagnosis of database corruption](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/238) +[false positive diagnosis of database corruption](https://libmdbx.dqdkfa.ru/dead-github/issues/238) due to flaws of an old library versions. This change is mostly invisible: @@ -704,7 +704,7 @@ Acknowledgements: Fixes: - Fixed possibility of looping update GC during transaction commit (no public issue since the problem was discovered inside [Positive Technologies](https://www.ptsecurity.ru)). 
- - Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/235). + - Fixed `#pragma pack` to avoid provoking some compilers to generate code with [unaligned access](https://libmdbx.dqdkfa.ru/dead-github/issues/235). - Fixed `noexcept` for potentially throwing `txn::put()` of C++ API. Minors: @@ -730,7 +730,7 @@ Extensions and improvements: Fixes: - - Always setup `madvise` while opening DB (fixes https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/231). + - Always setup `madvise` while opening DB (fixes https://libmdbx.dqdkfa.ru/dead-github/issues/231). - Fixed checking legacy `P_DIRTY` flag (`0x10`) for nested/sub-pages. Minors: @@ -751,11 +751,11 @@ Acknowledgements: - [Lionel Debroux](https://github.com/debrouxl) for fuzzing tests and reporting bugs. - [Sergey Fedotov](https://github.com/SergeyFromHell/) for [`node-mdbx` NodeJS bindings](https://www.npmjs.com/package/node-mdbx). - [Kris Zyp](https://github.com/kriszyp) for [`lmdbx-store` NodeJS bindings](https://github.com/kriszyp/lmdbx-store). - - [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings). + - [Noel Kuntze](https://github.com/Thermi) for [draft Python bindings](https://libmdbx.dqdkfa.ru/dead-github/commits/python-bindings). New features, extensions and improvements: - - [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/201). + - [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://libmdbx.dqdkfa.ru/dead-github/issues/201). - Added options support for `long-stochastic` script. - Avoided `MDBX_TXN_FULL` error for large transactions when possible. - The `MDBX_READERS_LIMIT` increased to `32767`. 
@@ -763,7 +763,7 @@ New features, extensions and improvements: - Minimized the size of poisoned/unpoisoned regions to avoid Valgrind/ASAN stuck. - Added more workarounds for QEMU for testing builds for 32-bit platforms, Alpha and Sparc architectures. - `mdbx_chk` now skips iteration & checking of DB' records if corresponding page-tree is corrupted (to avoid `SIGSEGV`, ASAN failures, etc). - - Added more checks for [rare/fuzzing corruption cases](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217). + - Added more checks for [rare/fuzzing corruption cases](https://libmdbx.dqdkfa.ru/dead-github/issues/217). Backward compatibility break: @@ -775,18 +775,18 @@ Backward compatibility break: Fixes: - Fixed excess meta-pages checks in case `mdbx_chk` is called to check the DB for a specific meta page and thus could prevent switching to the selected meta page, even if the check passed without errors. - - Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/203). - - Fixed [log a warning during a new DB creation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/205). - - Fixed [false-negative `mdbx_cursor_eof()` result](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/207). - - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/208). - - Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/209). - - Fixed [C++ Buffer issue with `std::string` and alignment](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/191). + - Fixed [recursive use of SRW-lock on Windows cause by `MDBX_NOTLS` option](https://libmdbx.dqdkfa.ru/dead-github/issues/203). 
+ - Fixed [log a warning during a new DB creation](https://libmdbx.dqdkfa.ru/dead-github/issues/205). + - Fixed [false-negative `mdbx_cursor_eof()` result](https://libmdbx.dqdkfa.ru/dead-github/issues/207). + - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://libmdbx.dqdkfa.ru/dead-github/issues/208). + - Fixed [installation by `CMake` in special cases by complete use `GNUInstallDirs`'s variables](https://libmdbx.dqdkfa.ru/dead-github/issues/209). + - Fixed [C++ Buffer issue with `std::string` and alignment](https://libmdbx.dqdkfa.ru/dead-github/issues/191). - Fixed `safe64_reset()` for platforms without atomic 64-bit compare-and-swap. - Fixed hang/shutdown on big-endian platforms without `__cxa_thread_atexit()`. - - Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/217). + - Fixed [using bad meta-pages if DB was partially/recoverable corrupted](https://libmdbx.dqdkfa.ru/dead-github/issues/217). - Fixed extra `noexcept` for `buffer::&assign_reference()`. - Fixed `bootid` generation on Windows for case of change system' time. - - Fixed [test framework keygen-related issue](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/127). + - Fixed [test framework keygen-related issue](https://libmdbx.dqdkfa.ru/dead-github/issues/127). ## v0.10.1 at 2021-06-01 @@ -807,10 +807,10 @@ New features: Fixes: - Fixed minor "foo not used" warnings from modern C++ compilers when building the C++ part of the library. - - Fixed confusing/messy errors when build library from unfit github's archives (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/197). + - Fixed confusing/messy errors when build library from unfit github's archives (https://libmdbx.dqdkfa.ru/dead-github/issues/197). - Fixed `#​e​l​s​i​f` typo. 
- - Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/195). - - Re-Fixed WSL1/WSL2 detection with distinguishing (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/97). + - Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/oust of dirty pages (https://libmdbx.dqdkfa.ru/dead-github/issues/195). + - Re-Fixed WSL1/WSL2 detection with distinguishing (https://libmdbx.dqdkfa.ru/dead-github/issues/97). ## v0.10.0 at 2021-05-09 @@ -833,7 +833,7 @@ New features: and conjointly with the `MDBX_ENV_CHECKPID=0` and `MDBX_TXN_CHECKOWNER=0` options can yield up to 30% more performance compared to LMDB. - Using float point (exponential quantized) representation for internal 16-bit values - of grow step and shrink threshold when huge ones (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/166). + of grow step and shrink threshold when huge ones (https://libmdbx.dqdkfa.ru/dead-github/issues/166). To minimize the impact on compatibility, only the odd values inside the upper half of the range (i.e. 32769..65533) are used for the new representation. - Added the `mdbx_drop` similar to LMDB command-line tool to purge or delete (sub)database(s). @@ -842,7 +842,7 @@ New features: - The internal node sizes were refined, resulting in a reduction in large/overflow pages in some use cases and a slight increase in limits for a keys size to ≈½ of page size. - Added to `mdbx_chk` output number of keys/items on pages. - - Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/180). + - Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://libmdbx.dqdkfa.ru/dead-github/pull/180). 
- Major rework page splitting (af9b7b560505684249b76730997f9e00614b8113) for - An "auto-appending" feature upon insertion for both ascending and descending key sequences. As a result, the optimality of page filling @@ -850,7 +850,7 @@ New features: inserting ordered sequences of keys, - A "splitting at middle" to make page tree more balanced on average. - Added `mdbx_get_sysraminfo()` to the API. - - Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183). + - Added guessing a reasonable maximum DB size for the default upper limit of geometry (https://libmdbx.dqdkfa.ru/dead-github/issues/183). - Major rework internal labeling of a dirty pages (958fd5b9479f52f2124ab7e83c6b18b04b0e7dda) for a "transparent spilling" feature with the gist to make a dirty pages be ready to spilling (writing to a disk) without further altering ones. @@ -866,7 +866,7 @@ New features: - Support `make help` to list available make targets. - Silently `make`'s build by default. - Preliminary [Python bindings](https://github.com/Thermi/libmdbx/tree/python-bindings) is available now - by [Noel Kuntze](https://github.com/Thermi) (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/147). + by [Noel Kuntze](https://github.com/Thermi) (https://libmdbx.dqdkfa.ru/dead-github/issues/147). Backward compatibility break: @@ -881,22 +881,22 @@ Backward compatibility break: Fixes: - - Fixed performance regression due non-optimal C11 atomics usage (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/160). - - Fixed "reincarnation" of subDB after it deletion (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/168). + - Fixed performance regression due non-optimal C11 atomics usage (https://libmdbx.dqdkfa.ru/dead-github/issues/160). + - Fixed "reincarnation" of subDB after it deletion (https://libmdbx.dqdkfa.ru/dead-github/issues/168). 
- Fixed (disallowing) implicit subDB deletion via operations on `@MAIN`'s DBI-handle. - - Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/171). - - Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/170). - - Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/179). - - Fixed an unreasonably huge default upper limit for DB geometry (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/183). + - Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://libmdbx.dqdkfa.ru/dead-github/issues/171). + - Fixed the selecting/adjustment values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://libmdbx.dqdkfa.ru/dead-github/issues/170). + - Fixed `mdbx_env_set_option()` for set initial and limit size of dirty page list ((https://libmdbx.dqdkfa.ru/dead-github/issues/179). + - Fixed an unreasonably huge default upper limit for DB geometry (https://libmdbx.dqdkfa.ru/dead-github/issues/183). - Fixed `constexpr` specifier for the `slice::invalid()`. - - Fixed (no)readahead auto-handling (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/164). + - Fixed (no)readahead auto-handling (https://libmdbx.dqdkfa.ru/dead-github/issues/164). - Fixed non-alloy build for Windows. - Switched to using Heap-functions instead of LocalAlloc/LocalFree on Windows. - - Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/190). 
+ - Fixed `mdbx_env_stat_ex()` to returning statistics of the whole environment instead of MainDB only (https://libmdbx.dqdkfa.ru/dead-github/issues/190). - Fixed building by GCC 4.8.5 (added workaround for a preprocessor's bug). - Fixed building C++ part for iOS <= 13.0 (unavailability of `std::filesystem::path`). - Fixed building for Windows target versions prior to Windows Vista (`WIN32_WINNT < 0x0600`). - - Fixed building by MinGW for Windows (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/155). + - Fixed building by MinGW for Windows (https://libmdbx.dqdkfa.ru/dead-github/issues/155). ------------------------------------------------------------------------------- @@ -919,7 +919,7 @@ Removed options and features: New features: - Package for FreeBSD is available now by Mahlon E. Smith. - - New API functions to get/set various options (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/128): + - New API functions to get/set various options (https://libmdbx.dqdkfa.ru/dead-github/issues/128): - the maximum number of named databases for the environment; - the maximum number of threads/reader slots; - threshold (since the last unsteady commit) to force flush the data buffers to disk; @@ -932,7 +932,7 @@ New features: - maximal part of the dirty pages may be spilled when necessary; - minimal part of the dirty pages should be spilled when necessary; - how much of the parent transaction dirty pages will be spilled while start each child transaction; - - Unlimited/Dynamic size of retired and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123). + - Unlimited/Dynamic size of retired and dirty page lists (https://libmdbx.dqdkfa.ru/dead-github/issues/123). - Added `-p` option (purge subDB before loading) to `mdbx_load` tool. 
- Reworked spilling of large transaction and committing of nested transactions: - page spilling code reworked to avoid the flaws and bugs inherited from LMDB; @@ -942,22 +942,22 @@ New features: - Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options. - Added `mdbx_default_pagesize()` function. - Better support architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc) by means [C11 atomics](https://en.cppreference.com/w/c/atomic). - - Speed up page number lists and dirty page lists (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/132). + - Speed up page number lists and dirty page lists (https://libmdbx.dqdkfa.ru/dead-github/issues/132). - Added `LIBMDBX_NO_EXPORTS_LEGACY_API` build option. Fixes: - - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/143). + - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://libmdbx.dqdkfa.ru/dead-github/pull/143). - Fixed `mdbx_realloc()` for case of nullptr and `MDBX_WITHOUT_MSVC_CRT=ON` for Windows. - - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/146). - - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153). + - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://libmdbx.dqdkfa.ru/dead-github/issues/146). + - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://libmdbx.dqdkfa.ru/dead-github/issues/153). - Fixed minor/potential memory leak during page flushing and unspilling. - Fixed handling states of cursors's and subDBs's for nested transactions. 
- Fixed page leak in extra rare case the list of retired pages changed during update GC on transaction commit. - - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/153). - - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123). + - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://libmdbx.dqdkfa.ru/dead-github/issues/153). + - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://libmdbx.dqdkfa.ru/dead-github/issues/123). - Fixed auto-recovery (`weak->steady` with the same boot-id) when Database size at last weak checkpoint is large than at last steady checkpoint. - - Fixed operation on systems with unusual small/large page size, including PowerPC (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/157). + - Fixed operation on systems with unusual small/large page size, including PowerPC (https://libmdbx.dqdkfa.ru/dead-github/issues/157). ## v0.9.2 at 2020-11-27 @@ -995,11 +995,11 @@ Fixes: - Fixed copy&paste typos. - Fixed minor false-positive GCC warning. - Added workaround for broken `DEFINE_ENUM_FLAG_OPERATORS` from Windows SDK. - - Fixed cursor state after multimap/dupsort repeated deletes (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/121). + - Fixed cursor state after multimap/dupsort repeated deletes (https://libmdbx.dqdkfa.ru/dead-github/issues/121). - Added `SIGPIPE` suppression for internal thread during `mdbx_env_copy()`. - - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/131). - - Fixed spilled pages checking (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/126). 
- - Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/136). + - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://libmdbx.dqdkfa.ru/dead-github/issues/131). + - Fixed spilled pages checking (https://libmdbx.dqdkfa.ru/dead-github/issues/126). + - Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://libmdbx.dqdkfa.ru/dead-github/issues/136). - Fixed save/restore/commit of cursors for nested transactions. - Fixed cursors state in rare/special cases (move next beyond end-of-data, after deletion and so on). - Added workaround for MSVC 19.28 (Visual Studio 16.8) (but may still hang during compilation). diff --git a/README.md b/README.md index d25189b8..44c68726 100644 --- a/README.md +++ b/README.md @@ -435,7 +435,7 @@ unexpected or broken down. ### Testing The amalgamated source code does not contain any tests for or several reasons. -Please read [the explanation](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/214#issuecomment-870717981) and don't ask to alter this. +Please read [the explanation](https://libmdbx.dqdkfa.ru/dead-github/issues/214#issuecomment-870717981) and don't ask to alter this. So for testing _libmdbx_ itself you need a full source code, i.e. the clone of a git repository, there is no option. The full source code of _libmdbx_ has a [`test` subdirectory](https://gitflic.ru/project/erthink/libmdbx/tree/master/test) with minimalistic test "framework". 
@@ -618,7 +618,7 @@ Bindings | Rust | [libmdbx-rs](https://github.com/vorot93/libmdbx-rs) | [Artem Vorotnikov](https://github.com/vorot93) | | Rust | [mdbx](https://crates.io/crates/mdbx) | [gcxfd](https://github.com/gcxfd) | | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | -| Python (draft) | [python-bindings](https://web.archive.org/web/https://github.com/erthink/libmdbx/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi) +| Python (draft) | [python-bindings](https://libmdbx.dqdkfa.ru/dead-github/commits/python-bindings) branch | [Noel Kuntze](https://github.com/Thermi) | .NET (obsolete) | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | diff --git a/TODO.md b/TODO.md index 70016f38..0d9fd46d 100644 --- a/TODO.md +++ b/TODO.md @@ -11,19 +11,19 @@ For the same reason ~~Github~~ is blacklisted forever. So currently most of the links are broken due to noted malicious ~~Github~~ sabotage. - - [Move most of `mdbx_chk` functional to the library API](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/204). - - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/210). - - [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/200). - - [Migration guide from LMDB to MDBX](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/199). - - [Support for RAW devices](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/124). - - [Support MessagePack for Keys & Values](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/115). - - [Engage new terminology](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/137). 
+ - [Move most of `mdbx_chk` functional to the library API](https://libmdbx.dqdkfa.ru/dead-github/issues/204). + - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://libmdbx.dqdkfa.ru/dead-github/issues/210). + - [More flexible support of asynchronous runtime/framework(s)](https://libmdbx.dqdkfa.ru/dead-github/issues/200). + - [Migration guide from LMDB to MDBX](https://libmdbx.dqdkfa.ru/dead-github/issues/199). + - [Support for RAW devices](https://libmdbx.dqdkfa.ru/dead-github/issues/124). + - [Support MessagePack for Keys & Values](https://libmdbx.dqdkfa.ru/dead-github/issues/115). + - [Engage new terminology](https://libmdbx.dqdkfa.ru/dead-github/issues/137). - Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc. Done ---- - - [Simple careful mode for working with corrupted DB](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/223). - - [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/224). - - [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/192). - - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/193). + - [Simple careful mode for working with corrupted DB](https://libmdbx.dqdkfa.ru/dead-github/issues/223). + - [Engage an "overlapped I/O" on Windows](https://libmdbx.dqdkfa.ru/dead-github/issues/224). + - [Large/Overflow pages accounting for dirty-room](https://libmdbx.dqdkfa.ru/dead-github/issues/192). + - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://libmdbx.dqdkfa.ru/dead-github/issues/193). 
diff --git a/mdbx.h++ b/mdbx.h++ index bc3e1802..b57e7507 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -287,7 +287,7 @@ namespace mdbx { // To enable all kinds of an compiler optimizations we use a byte-like type // that don't presumes aliases for pointers as does the `char` type and its // derivatives/typedefs. -// Please see https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/263 +// Please see https://libmdbx.dqdkfa.ru/dead-github/issues/263 // for reasoning of the use of `char8_t` type and switching to `__restrict__`. using byte = char8_t; #else diff --git a/src/core.c b/src/core.c index 2342a970..089be05b 100644 --- a/src/core.c +++ b/src/core.c @@ -4445,7 +4445,7 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize); const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset); /* check with timeout as the workaround - * for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ if (unlikely(memcmp(wp, rp, bytes))) { ctx->coherency_timestamp = 0; WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, @@ -6604,7 +6604,7 @@ static __inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, return false; /* avoid (recursive) search inside empty tree and while tree is - updating, https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/31 */ + updating, https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ if (txn->mt_dbs[FREE_DBI].md_entries == 0) return false; @@ -6808,7 +6808,7 @@ next_gc:; /* Stop reclaiming to avoid large/overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. 
- * https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 + * https://libmdbx.dqdkfa.ru/dead-github/issues/123 */ NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " "(chunk) -> %zu", @@ -7999,7 +7999,7 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_SUCCESS; } -/* check against https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ +/* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, const volatile MDBX_db *dbs, const volatile MDBX_meta *meta, bool report) { @@ -8100,7 +8100,7 @@ __cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { } /* check with timeout as the workaround - * for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 */ + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ __hot static int coherency_check_readed(const MDBX_env *env, const txnid_t txnid, const volatile MDBX_db *dbs, @@ -8336,8 +8336,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->tw.troika = meta_tap(env); const meta_ptr_t head = meta_recent(env, &txn->tw.troika); uint64_t timestamp = 0; - while ( - "workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { + while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs, head.ptr_v, ×tamp); if (likely(rc == MDBX_SUCCESS)) @@ -11978,7 +11977,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, } uint64_t timestamp = 0; - while ("workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { + while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { rc = coherency_check_written(env, pending->unsafe_txnid, target, ×tamp); if (likely(rc == MDBX_SUCCESS)) @@ -12500,7 +12499,7 @@ mdbx_env_set_geometry(MDBX_env 
*env, intptr_t size_lower, intptr_t size_now, uint64_t timestamp = 0; while ("workaround for " - "https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269") { + "https://libmdbx.dqdkfa.ru/dead-github/issues/269") { meta = *head.ptr_c; rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta, ×tamp); @@ -13843,7 +13842,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } else { #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. - * See https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/67 */ + * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ if ((flags & MDBX_WRITEMAP) == 0) { if (flags & MDBX_ACCEDE) flags |= MDBX_WRITEMAP; @@ -21198,7 +21197,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); /* is the environment open? - * (https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/171) */ + * (https://libmdbx.dqdkfa.ru/dead-github/issues/171) */ if (unlikely(!env->me_map)) { /* environment not yet opened */ #if 1 diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 57ea1631..df289401 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -1171,7 +1171,7 @@ int main(int argc, char *argv[]) { envflags &= ~MDBX_RDONLY; #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. 
- * See https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/67 */ + * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ envflags |= MDBX_WRITEMAP; #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ break; diff --git a/src/osal.c b/src/osal.c index 9bd30d04..ccf2205a 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1311,7 +1311,7 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, flags |= O_CLOEXEC; #endif /* O_CLOEXEC */ - /* Safeguard for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/144 */ + /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; static const char dev_null[] = "/dev/null"; @@ -1349,7 +1349,7 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, errno = EACCES /* restore errno if file exists */; } - /* Safeguard for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/144 */ + /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 if (*fd == STDIN_FILENO) { WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", @@ -2197,7 +2197,7 @@ MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. 
- * See https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, (map->filesize && map->filesize < map->limit) @@ -2276,7 +2276,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. - * See https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); @@ -2557,7 +2557,7 @@ retry_mapview:; /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See - * https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( map->address, @@ -2579,7 +2579,7 @@ retry_mapview:; /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See - * https://web.archive.org/web/https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( map->address, (map->current < map->limit) ? 
map->current : map->limit); diff --git a/test/valgrind_suppress.txt b/test/valgrind_suppress.txt index 2e67a56b..96d1327b 100644 --- a/test/valgrind_suppress.txt +++ b/test/valgrind_suppress.txt @@ -31,7 +31,7 @@ fun:wipe_steady* } -# memcmp() inside iov_write() as workaround for https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/269 +# memcmp() inside iov_write() as workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269 { iov-pagecheck-1 Memcheck:Cond From 30972102e519c22092595f878bfc6e32392375a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 25 Nov 2022 18:56:15 +0300 Subject: [PATCH 233/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=81=D0=B1=D0=BE=D1=80?= =?UTF-8?q?=D0=BA=D0=B8=20=D0=BF=D1=80=D0=B8=20`MDBX=5FPNL=5FASCENDING=3D1?= =?UTF-8?q?`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 089be05b..51184bf1 100644 --- a/src/core.c +++ b/src/core.c @@ -6212,6 +6212,8 @@ MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) { } #endif /* _MSC_VER */ +#if !MDBX_PNL_ASCENDING + #if !defined(MDBX_ATTRIBUTE_TARGET) && \ (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) @@ -6525,6 +6527,8 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, /* Choosing of another variants should be added here. 
*/ #endif /* scan4seq_default */ +#endif /* MDBX_PNL_ASCENDING */ + #ifndef scan4seq_default #define scan4seq_default scan4seq_fallback #endif /* scan4seq_default */ @@ -10518,7 +10522,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); i -= 1; } while (i && sl[i] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_GETSIZE(sl) = i; + MDBX_PNL_SETSIZE(sl, i); #else assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); size_t i = 0; From b3248442962cfdda728656d6d9085147a7d42b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 28 Nov 2022 15:45:29 +0300 Subject: [PATCH 234/364] =?UTF-8?q?mdbx:=20=D0=9E=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 16e6b1a4..3aa0f7ee 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,21 +4,34 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.3 в процессе подготовки срочного исправления +## v0.12.3 (Акула) в процессе подготовки -Исправления (без корректировок вышеперечисленных новых функций): +Благодарности: + - [Alex Sharov](https://t.me/AskAlexSharov) и команде [Erigon](https://github.com/ledgerwatch/erigon) за тестирование. + - [Simon Leier](https://t.me/leisim) за сообщение о сбоях и тестирование. 
+ +Новое: + + - Использование адреса https://libmdbx.dqdkfa.ru/dead-github для отсылки к сохранённым в web.archive.org копиям ресурсов, уничтоженных администрацией Github. + +Исправления (без корректировок новых функций): + + - Устранение SIGSEGV или ошибочного вызова `free()` в ситуациях повторного открытия среды посредством `mdbx_env_open()`. - Устранение ошибки совершенной в коммите fe20de136c22ed3bc4c6d3f673e79c106e824f60 от 2022-09-18, в результате чего на Linux в режиме `MDBX_WRITEMAP` никогда не вызывался `msync()`. Проблема существует только в релизе 0.12.2. - Добавление подсчета грязных страниц в `MDBX_WRITEMAP` для предоставления посредством `mdbx_txn_info()` актуальной информации об объеме изменений в процессе транзакций чтения-записи. - Исправление несущественной опечатки в условиях `#if` определения порядка байт. + - Исправление сборки для случая `MDBX_PNL_ASCENDING=1`. -Мелочи: +Ликвидация технических долгов и мелочи: - Доработка поддержки авто-слияния записей GC внутри `page_alloc_slowpath()`. - Устранение несущественных предупреждений Coverity. + - Использование единого курсора для поиска в GC. + - Переработка внутренних флагов связанных с выделением страниц из GC. 
------------------------------------------------------------------------------- From 3757eb72f7c6b46862f8f17881ac88e8cecc1979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 25 Nov 2022 18:04:43 +0300 Subject: [PATCH 235/364] =?UTF-8?q?mdbx:=20=D1=8D=D0=BA=D0=BE=D0=BD=D0=BE?= =?UTF-8?q?=D0=BC=D0=B8=D1=8F=20=D0=BF=D0=BE=D1=81=D0=BB=D0=B5=D0=B4=D0=BE?= =?UTF-8?q?=D0=B2=D0=B0=D1=82=D0=B5=D0=BB=D1=8C=D0=BD=D0=BE=D1=81=D1=82?= =?UTF-8?q?=D0=B5=D0=B9=20=D0=BF=D1=80=D0=B8=20=D0=B2=D1=8B=D0=B4=D0=B5?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B8=20=D0=BE=D0=B4=D0=B8=D0=BD=D0=BE?= =?UTF-8?q?=D1=87=D0=BD=D1=8B=D1=85=20=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B8?= =?UTF-8?q?=D1=86.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 215 +++++++++++++++++++++++++++++------------------- src/internals.h | 2 + 2 files changed, 134 insertions(+), 83 deletions(-) diff --git a/src/core.c b/src/core.c index 51184bf1..335da529 100644 --- a/src/core.c +++ b/src/core.c @@ -6516,7 +6516,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) #define scan4seq_default scan4seq_avx512bw -#define scan4seq scan4seq_default +#define scan4seq_impl scan4seq_default #elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) #define scan4seq_default scan4seq_avx2 #elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) @@ -6533,20 +6533,20 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #define scan4seq_default scan4seq_fallback #endif /* scan4seq_default */ -#ifdef scan4seq -/* The scan4seq() is the best or no alternatives */ +#ifdef scan4seq_impl +/* The scan4seq_impl() is the best or no alternatives */ #elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS /* The scan4seq_default() will be used since no cpu-features detection support * 
from compiler. Please don't ask to implement cpuid-based detection and don't * make such PRs. */ -#define scan4seq scan4seq_default +#define scan4seq_impl scan4seq_default #else /* Selecting the most appropriate implementation at runtime, * depending on the available CPU features. */ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t seq); -static pgno_t *(*scan4seq)(pgno_t *range, const size_t len, - const size_t seq) = scan4seq_resolver; +static pgno_t *(*scan4seq_impl)(pgno_t *range, const size_t len, + const size_t seq) = scan4seq_resolver; static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t seq) { @@ -6569,10 +6569,10 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, choice = scan4seq_avx512bw; #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ /* Choosing of another variants should be added here. */ - scan4seq = choice ? choice : scan4seq_default; - return scan4seq(range, len, seq); + scan4seq_impl = choice ? choice : scan4seq_default; + return scan4seq_impl(range, len, seq); } -#endif /* scan4seq */ +#endif /* scan4seq_impl */ //------------------------------------------------------------------------------ @@ -6623,6 +6623,83 @@ __hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { return false; } +__hot static pgno_t pnl_get_single(MDBX_PNL pnl) { + const size_t len = MDBX_PNL_GETSIZE(pnl); + assert(len > 0); + pgno_t *target = MDBX_PNL_EDGE(pnl); + const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 
1 : -1; + + /* пытаемся пропускать последовательности при наличии одиночных элементов */ + if (likely(len > 2) && unlikely(target[dir] == *target + 1)) { + pgno_t *scan = target + dir + dir; + size_t left = len; + do { + if (likely(scan[-dir] != *scan - 1 && *scan + 1 != scan[dir])) { +#if MDBX_PNL_ASCENDING + target = scan; + break; +#else + /* вырезаем элемент с перемещением хвоста */ + const pgno_t pgno = *scan; + MDBX_PNL_SETSIZE(pnl, len - 1); + while (++scan <= target) + scan[-1] = *scan; + return pgno; +#endif + } + scan += dir; + } while (--left > 2); + } + + const pgno_t pgno = *target; +#if MDBX_PNL_ASCENDING + /* вырезаем элемент с перемещением хвоста */ + MDBX_PNL_SETSIZE(pnl, len - 1); + for (const pgno_t *const end = pnl + len - 1; target <= end; ++target) + *target = target[1]; +#else + /* перемещать хвост не нужно, просто усекам список */ + MDBX_PNL_SETSIZE(pnl, len - 1); +#endif + return pgno; +} + +__hot static pgno_t pnl_get_sequence(MDBX_PNL pnl, const size_t num, + uint8_t flags) { + const size_t len = MDBX_PNL_GETSIZE(pnl); + pgno_t *edge = MDBX_PNL_EDGE(pnl); + assert(len >= num && num > 1); + const size_t seq = num - 1; +#if !MDBX_PNL_ASCENDING + if (edge[-(ptrdiff_t)seq] - *edge == seq) { + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + assert(edge == scan4range_checker(pnl, seq)); + /* перемещать хвост не нужно, просто усекам список */ + MDBX_PNL_SETSIZE(pnl, len - num); + return *edge; + } +#endif + pgno_t *target = scan4seq_impl(edge, len, seq); + assert(target == scan4range_checker(pnl, seq)); + if (target) { + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + const pgno_t pgno = *target; + /* вырезаем найденную последовательность с перемещением хвоста */ + MDBX_PNL_SETSIZE(pnl, len - num); +#if MDBX_PNL_ASCENDING + for (const pgno_t *const end = pnl + len - num; target <= end; ++target) + *target = target[num]; +#else + for (const pgno_t *const end = pnl + len; ++target <= end;) + target[-(ptrdiff_t)num] 
= *target; +#endif + return pgno; + } + return 0; +} + static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, uint8_t flags) { #if MDBX_ENABLE_PROFGC @@ -6646,25 +6723,22 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno = 0, *range = nullptr; - size_t newnext, re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + pgno_t pgno = 0; + size_t newnext; if (num > 1) { #if MDBX_ENABLE_PROFGC prof->xpages += 1; #endif /* MDBX_ENABLE_PROFGC */ - if (re_len >= num) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; + pgno = pnl_get_sequence(txn->tw.relist, num, flags); + if (likely(pgno)) goto done; - } } } else { - eASSERT(env, num == 0 || re_len == 0); + eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.relist) == 0); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE) || num == 0); } //--------------------------------------------------------------------------- @@ -6681,7 +6755,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, * большего резерва из-за увеличения списка переработанных страниц. 
*/ (flags & MDBX_ALLOC_RESERVE) == 0) { if (txn->mt_dbs[FREE_DBI].md_branch_pages && - re_len < env->me_maxgc_ov1page / 2) + MDBX_PNL_GETSIZE(txn->tw.relist) < env->me_maxgc_ov1page / 2) flags += MDBX_ALLOC_COALESCE; } @@ -6775,40 +6849,38 @@ next_gc:; const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, - gc_len + re_len); + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); - if (unlikely(gc_len + re_len >= env->me_maxgc_ov1page)) { + if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= + env->me_maxgc_ov1page)) { /* Don't try to coalesce too much. */ if (flags & MDBX_ALLOC_SHOULD_SCAN) { eASSERT(env, flags & MDBX_ALLOC_COALESCE); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); eASSERT(env, num > 0); #if MDBX_ENABLE_PROFGC env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; #endif /* MDBX_ENABLE_PROFGC */ TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - if (re_len >= num) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 
1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; + if (likely(num == 1)) { + pgno = pnl_get_single(txn->tw.relist); goto done; } + pgno = pnl_get_sequence(txn->tw.relist, num, flags); + if (likely(pgno)) + goto done; } flags -= MDBX_ALLOC_COALESCE | MDBX_ALLOC_SHOULD_SCAN; } - if (unlikely(/* list is too long already */ re_len >= - env->me_options.rp_augment_limit) && + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( + txn->tw.relist) >= env->me_options.rp_augment_limit) && ((/* not a slot-request from gc-update */ num && /* have enough unallocated space */ txn->mt_geo.upper >= txn->mt_next_pgno + num) || - gc_len + re_len >= MDBX_PGL_LIMIT)) { + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { /* Stop reclaiming to avoid large/overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. 
@@ -6816,7 +6888,8 @@ next_gc:; */ NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " "(chunk) -> %zu", - re_len, gc_len, gc_len + re_len); + MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); goto depleted_gc; } } @@ -6844,7 +6917,7 @@ next_gc:; } /* Merge in descending sorted order */ - re_len = pnl_merge(txn->tw.relist, gc_pnl); + pnl_merge(txn->tw.relist, gc_pnl); flags |= MDBX_ALLOC_SHOULD_SCAN; if (AUDIT_ENABLED()) { if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { @@ -6856,15 +6929,13 @@ next_gc:; } eASSERT(env, dirtylist_check(txn)); - eASSERT(env, - re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); - if (MDBX_ENABLE_REFUND && re_len && + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); + if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.relist) && unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { /* Refund suitable pages into "unallocated" space */ txn_refund(txn); - re_len = MDBX_PNL_GETSIZE(txn->tw.relist); } - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -6872,7 +6943,7 @@ next_gc:; if (unlikely(num == 0)) { eASSERT(env, ret.err == MDBX_SUCCESS); TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, - re_len); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto early_exit; } @@ -6881,31 +6952,29 @@ next_gc:; eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); if (flags & MDBX_ALLOC_COALESCE) { TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, - re_len); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto next_gc; } scan: eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); eASSERT(env, num > 0); - if (re_len >= num) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < 
txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; + if (likely(num == 1)) { + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); + pgno = pnl_get_single(txn->tw.relist); goto done; } + pgno = pnl_get_sequence(txn->tw.relist, num, flags); + if (likely(pgno)) + goto done; } flags -= MDBX_ALLOC_SHOULD_SCAN; if (ret.err == MDBX_SUCCESS) { TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, - re_len); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto next_gc; } @@ -6924,7 +6993,7 @@ depleted_gc: * - extend the database file. */ /* Will use new pages from the map if nothing is suitable in the GC. */ - newnext = (pgno = txn->mt_next_pgno) + num; + newnext = txn->mt_next_pgno + num; /* Does reclaiming stopped at the last steady point? */ const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); @@ -7001,7 +7070,7 @@ depleted_gc: /* Avoid kick lagging reader(s) if is enough unallocated space * at the end of database file. */ if (!(flags & MDBX_ALLOC_RESERVE) && newnext <= txn->mt_end_pgno) { - eASSERT(env, range == nullptr); + eASSERT(env, pgno == 0); goto done; } @@ -7014,13 +7083,14 @@ depleted_gc: //--------------------------------------------------------------------------- no_gc: + eASSERT(env, pgno == 0); if (flags & MDBX_ALLOC_RESERVE) { ret.err = MDBX_NOTFOUND; goto fail; } /* Will use new pages from the map if nothing is suitable in the GC. 
*/ - newnext = (pgno = txn->mt_next_pgno) + num; + newnext = txn->mt_next_pgno + num; if (newnext <= txn->mt_end_pgno) goto done; @@ -7052,27 +7122,15 @@ no_gc: goto fail; } env->me_txn->mt_end_pgno = (pgno_t)aligned; + eASSERT(env, pgno == 0); //--------------------------------------------------------------------------- done: ret.err = MDBX_SUCCESS; if (likely((flags & MDBX_ALLOC_RESERVE) == 0)) { - ENSURE(env, pgno >= NUM_METAS); - if (range) { - eASSERT(env, pgno == *range); + if (pgno) { eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); - /* Cutoff allocated pages from tw.relist */ -#if MDBX_PNL_ASCENDING - for (const pgno_t *const end = re_list + re_len - num; range <= end; - ++range) - *range = range[num]; -#else - for (const pgno_t *const end = txn->tw.relist + re_len; ++range <= end;) - range[-(ptrdiff_t)num] = *range; -#endif - MDBX_PNL_SETSIZE(txn->tw.relist, re_len -= num); eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { @@ -7082,6 +7140,7 @@ done: eASSERT(env, pgno >= NUM_METAS && pgno + num <= txn->mt_next_pgno); } + ENSURE(env, pgno >= NUM_METAS); #if MDBX_ENABLE_PROFGC if (!monotime_shot) monotime_shot = osal_monotime(); @@ -7182,20 +7241,10 @@ __hot static pgr_t page_alloc(const MDBX_cursor *mc) { return ret; } - MDBX_PNL pnl = txn->tw.relist; - const size_t len = MDBX_PNL_GETSIZE(pnl); - if (likely(len > 0)) { + if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) { + const pgno_t pgno = pnl_get_single(txn->tw.relist); MDBX_env *const env = txn->mt_env; - MDBX_PNL_SETSIZE(pnl, len - 1); -#if MDBX_PNL_ASCENDING - const pgno_t pgno = pnl[1]; - for (size_t i = 1; i < len; ++i) - pnl[i] = pnl[i + 1]; -#else - const pgno_t pgno = pnl[len]; -#endif - #if MDBX_ENABLE_PROFGC const uint64_t monotime_before = osal_monotime(); size_t majflt_before; diff --git a/src/internals.h b/src/internals.h index e6bcdd9a..383581f8 100644 --- 
a/src/internals.h +++ b/src/internals.h @@ -949,9 +949,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif From 07674ada47e363df3f5691c09d6c99fd4bf95273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 18 Nov 2022 15:41:15 +0300 Subject: [PATCH 236/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B0=20=D0=BF=D0=BE=D0=B4=D0=B3=D0=BE=D1=82?= =?UTF-8?q?=D0=BE=D0=B2=D0=BA=D0=B8=20=D1=80=D0=B5=D0=B7=D0=B5=D1=80=D0=B2?= =?UTF-8?q?=D0=B0=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B4=20=D0=BE=D0=B1=D0=BD?= =?UTF-8?q?=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20GC=20=D0=BF?= =?UTF-8?q?=D1=80=D0=B8=20=D0=B2=D0=BA=D0=BB=D1=8E=D1=87=D0=B5=D0=BD=D0=BD?= =?UTF-8?q?=D0=BE=D0=BC=20BigFoot.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 188 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 125 insertions(+), 63 deletions(-) diff --git a/src/core.c b/src/core.c index 335da529..407cd935 100644 --- a/src/core.c +++ b/src/core.c @@ -6761,6 +6761,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, MDBX_cursor *const gc = (MDBX_cursor *)((char *)env->me_txn0 + sizeof(MDBX_txn)); + eASSERT(env, mc != gc && gc->mc_next == nullptr); gc->mc_txn = txn; gc->mc_flags = 0; @@ -7188,8 +7189,16 @@ done: } if (LOG_ENABLED(level)) debug_log(level, __func__, __LINE__, - "unable alloc %zu %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); + "unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags " 
+ "0x%x, re-list-len %zu, loose-count %zu, gc: height %u, " + "branch %zu, leaf %zu, large %zu, entries %zu\n", + num, what, flags, ret.err, txn->mt_flags, + MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count, + txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); ret.page = NULL; } } else { @@ -9508,7 +9517,13 @@ static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { int err = MDBX_SUCCESS; - if (ctx->retired_stored) + if (ctx->retired_stored) { + MDBX_cursor *const gc = (MDBX_cursor *)((char *)txn + sizeof(MDBX_txn)); + tASSERT(txn, txn == txn->mt_env->me_txn0 && gc->mc_next == nullptr); + gc->mc_txn = txn; + gc->mc_flags = 0; + gc->mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = gc; do { MDBX_val key, val; #if MDBX_ENABLE_BIGFOOT @@ -9517,11 +9532,10 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { key.iov_base = &txn->mt_txnid; #endif /* MDBX_ENABLE_BIGFOOT */ key.iov_len = sizeof(txnid_t); - const struct cursor_set_result csr = - cursor_set(&ctx->cursor, &key, &val, MDBX_SET); + const struct cursor_set_result csr = cursor_set(gc, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; - err = mdbx_cursor_del(&ctx->cursor, 0); + err = mdbx_cursor_del(gc, 0); TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); } @@ -9531,6 +9545,9 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { #else while (0); #endif /* MDBX_ENABLE_BIGFOOT */ + txn->mt_cursors[FREE_DBI] = gc->mc_next; + gc->mc_next = nullptr; + } return err; } @@ -9544,28 +9561,45 @@ static int gcu_touch(gcu_context_t *ctx) { /* Prepare a backlog of pages to modify GC itself, while reclaiming is * prohibited. 
It should be enough to prevent search in page_alloc_slowpath() * during a deleting, when GC tree is unbalanced. */ -static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, - const bool reserve4retired) { - const size_t pages4retiredlist = - reserve4retired ? number_of_ovpages( - txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)) - : 0; - const size_t backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; - const size_t backlog4rebalance = backlog4cow + 1; +static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { + const size_t for_cow = txn->mt_dbs[FREE_DBI].md_depth; + const size_t for_rebalance = for_cow + 1; + size_t for_split = ctx->retired_stored == 0; - if (likely(pages4retiredlist < 2 && - gcu_backlog_size(txn) > (reserve4retired - ? backlog4rebalance - : (backlog4cow + backlog4rebalance)))) + const intptr_t retired_left = + MDBX_PNL_SIZEOF(txn->tw.retired_pages) - ctx->retired_stored; + size_t for_retiredlist = 0; + if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { + for_retiredlist = (retired_left + txn->mt_env->me_maxgc_ov1page - 1) / + txn->mt_env->me_maxgc_ov1page; + const size_t per_branch_page = + (txn->mt_env->me_psize - PAGEHDRSZ) / + (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t)); + for (size_t entries = for_retiredlist; entries > 1; for_split += entries) + entries = (entries + per_branch_page - 1) / per_branch_page; + } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { + for_retiredlist = + number_of_ovpages(txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + } + + const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; + const size_t for_tree_after_touch = for_rebalance + for_split; + const size_t for_data = for_retiredlist; + const size_t for_all_before_touch = for_data + for_tree_before_touch; + const size_t for_all_after_touch = for_data + for_tree_after_touch; + + if (likely(for_data < 2 && gcu_backlog_size(txn) > for_all_before_touch)) return MDBX_SUCCESS; - TRACE( - ">> reserve4retired %c, 
backlog %zu, 4list %zu, 4cow %zu, 4rebalance %zu", - reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, - backlog4cow, backlog4rebalance); + TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " + "4split %zu, " + "4cow %zu, 4tree %zu)", + ctx->retired_stored, retired_left, gcu_backlog_size(txn), + for_all_before_touch, for_data, for_split, for_cow, + for_tree_before_touch); int err; - if (unlikely(pages4retiredlist > 2)) { + if (unlikely(for_data > 2)) { MDBX_val key, val; key.iov_base = val.iov_base = nullptr; key.iov_len = sizeof(txnid_t); @@ -9578,28 +9612,34 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, err = gcu_touch(ctx); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); - if (unlikely(pages4retiredlist > 1) && + if (!MDBX_ENABLE_BIGFOOT && unlikely(for_data > 1) && MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && err == MDBX_SUCCESS) { - tASSERT(txn, reserve4retired); - err = gcu_clean_stored_retired(txn, ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = - page_alloc_slowpath(&ctx->cursor, pages4retiredlist, MDBX_ALLOC_RESERVE) - .err; + if (unlikely(ctx->retired_stored)) { + err = gcu_clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (!ctx->retired_stored) + return /* restart by tail-recursion */ gcu_prepare_backlog(txn, ctx); + } + err = page_alloc_slowpath(&ctx->cursor, for_data, MDBX_ALLOC_RESERVE).err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); cASSERT(&ctx->cursor, - gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); + gcu_backlog_size(txn) >= for_data || err != MDBX_SUCCESS); } - while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && - err == MDBX_SUCCESS) + while (gcu_backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) err = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE | MDBX_ALLOC_UNIMPORTANT) .err; - 
TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); + TRACE("<< backlog %zu, err %d, gc: height %u, branch %zu, leaf %zu, large " + "%zu, entries %zu", + gcu_backlog_size(txn), err, txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } @@ -9665,9 +9705,10 @@ retry: TRACE("%s", " >> continue"); if (ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages) && - (MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || + (ctx->loop == 1 || + MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || ctx->retired_stored > env->me_maxgc_ov1page)) { - rc = gcu_prepare_backlog(txn, ctx, true); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9696,7 +9737,7 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx, false); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9730,7 +9771,7 @@ retry: if (ctx->cleaned_id > txn->tw.last_reclaimed) break; if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx, false); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9767,30 +9808,29 @@ retry: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { + tASSERT(txn, txn->tw.loose_count > 0); /* Return loose page numbers to tw.relist, * though usually none are left at this point. * The pages themselves remain in dirtylist. 
*/ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { - if (txn->tw.loose_count > 0) { - TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, - txn->tw.loose_count); - rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; - if (rc == MDBX_SUCCESS) { - TRACE("%s: retry since gc-slot for %zu loose-pages available", - dbg_prefix_mode, txn->tw.loose_count); - continue; - } - - /* Put loose page numbers in tw.retired_pages, - * since unable to return them to tw.relist. */ - if (unlikely((rc = pnl_need(&txn->tw.retired_pages, - txn->tw.loose_count)) != 0)) - goto bailout; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) - pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, - txn->tw.loose_count); + TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, + txn->tw.loose_count); + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; + if (rc == MDBX_SUCCESS) { + TRACE("%s: retry since gc-slot for %zu loose-pages available", + dbg_prefix_mode, txn->tw.loose_count); + continue; } + + /* Put loose page numbers in tw.retired_pages, + * since unable to return them to tw.relist. 
*/ + if (unlikely((rc = pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) + goto bailout; + for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) + pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, + txn->tw.loose_count); } else { /* Room for loose pages + temp PNL with same */ rc = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); @@ -9874,14 +9914,31 @@ retry: } retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); - rc = gcu_prepare_backlog(txn, ctx, true); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + if (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix_mode, + retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); ctx->retired_stored = 0; ctx->bigfoot = txn->mt_txnid; do { + if (ctx->retired_stored) { + rc = gcu_prepare_backlog(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (ctx->retired_stored >= + MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", + dbg_prefix_mode, retired_pages_before, + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } + } key.iov_len = sizeof(txnid_t); key.iov_base = &ctx->bigfoot; const size_t left = @@ -9927,7 +9984,7 @@ retry: key.iov_len = sizeof(txnid_t); key.iov_base = &txn->mt_txnid; do { - gcu_prepare_backlog(txn, ctx, true); + gcu_prepare_backlog(txn, ctx); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) @@ -9940,7 +9997,7 @@ retry: eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - TRACE("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, + TRACE("%s: put-retired #%zu 
@ %" PRIaTXN, dbg_prefix_mode, ctx->retired_stored, txn->mt_txnid); #endif /* MDBX_ENABLE_BIGFOOT */ if (LOG_ENABLED(MDBX_LOG_EXTRA)) { @@ -9982,7 +10039,7 @@ retry: if (0 >= (intptr_t)left) break; - const size_t prefer_max_scatter = 257; + const size_t prefer_max_scatter = MDBX_ENABLE_BIGFOOT ? MDBX_TXL_MAX : 257; txnid_t reservation_gc_id; if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { @@ -10154,6 +10211,10 @@ retry: : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid : INT16_MAX; if (avail_gc_slots > 1) { +#if MDBX_ENABLE_BIGFOOT + chunk = (chunk < env->me_maxgc_ov1page * 2) ? chunk / 2 + : env->me_maxgc_ov1page; +#else if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; else { @@ -10190,6 +10251,7 @@ retry: : tail; } } +#endif /* MDBX_ENABLE_BIGFOOT */ } } tASSERT(txn, chunk > 0); @@ -10217,7 +10279,7 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); - gcu_prepare_backlog(txn, ctx, true); + gcu_prepare_backlog(txn, ctx); rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); tASSERT(txn, pnl_check_allocated(txn->tw.relist, From c5ddf12602cb2fc2a219a077d608f17553e3017d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 26 Nov 2022 01:04:21 +0300 Subject: [PATCH 237/364] =?UTF-8?q?mdbx:=20=D1=83=D0=B2=D0=B5=D0=BB=D0=B8?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BB=D0=B8=D0=BC=D0=B8=D1=82?= =?UTF-8?q?=D0=B0=20`MDBX=5FTXL=5FMAX`=20=D0=B4=D0=BE=202^26.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/internals.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/internals.h b/src/internals.h index 383581f8..e5217ef6 100644 --- a/src/internals.h +++ b/src/internals.h @@ -933,7 +933,7 @@ typedef struct MDBX_dpl { 
#define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) #define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) From c521a21f051cf14c61d65c04e0e30ec41820657b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 26 Nov 2022 16:58:10 +0300 Subject: [PATCH 238/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BC?= =?UTF-8?q?=D0=B5=D1=89=D0=B5=D0=BD=D0=B8=D0=B5=20`mp=5Fnext`=20=D0=B2=20?= =?UTF-8?q?=D0=B7=D0=B0=D0=B3=D0=BE=D0=BB=D0=BE=D0=B2=D0=BA=D0=B5=20=D1=81?= =?UTF-8?q?=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86=D1=8B=20=D0=B4=D0=BB=D1=8F?= =?UTF-8?q?=20=D0=BE=D1=82=D0=B4=D0=B5=D0=BB=D0=B5=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=BE=D1=82=20`mp=5Ftxnid`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 111 +++++++++++++++++++++++++++++------------------- src/internals.h | 8 ++-- 2 files changed, 71 insertions(+), 48 deletions(-) diff --git a/src/core.c b/src/core.c index 407cd935..65c261e6 100644 --- a/src/core.c +++ b/src/core.c @@ -2741,8 +2741,11 @@ static __always_inline size_t dpl_bytes2size(const ptrdiff_t bytes) { } static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { - static const MDBX_page dpl_stub_pageE = { - {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0}; + static const MDBX_page dpl_stub_pageE = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ ~(pgno_t)0}; assert(dpl_stub_pageE.mp_flags == P_BAD && dpl_stub_pageE.mp_pgno == P_INVALID); dl->length = len; @@ -2753,7 +2756,11 @@ static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { } static __always_inline void dpl_clear(MDBX_dpl *dl) { - static const MDBX_page dpl_stub_pageB = {{0}, 0, 
P_BAD, {0}, /* pgno */ 0}; + static const MDBX_page dpl_stub_pageB = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ 0}; assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); dl->sorted = dpl_setlen(dl, 0); dl->pages_including_loose = 0; @@ -3671,8 +3678,8 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { eASSERT(env, env->me_dp_reserve_len > 0); MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); VALGRIND_MEMPOOL_ALLOC(env, np, size); - VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); - env->me_dp_reserve = np->mp_next; + VALGRIND_MAKE_MEM_DEFINED(&mp_next(np), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(np); env->me_dp_reserve_len -= 1; } else { size = pgno2bytes(env, num); @@ -3710,9 +3717,9 @@ static void dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) { memset(dp, -1, pgno2bytes(env, npages)); if (npages == 1 && env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { - MDBX_ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next), - env->me_psize - sizeof(dp->mp_next)); - dp->mp_next = env->me_dp_reserve; + MDBX_ASAN_POISON_MEMORY_REGION(dp, env->me_psize); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); + mp_next(dp) = env->me_dp_reserve; VALGRIND_MEMPOOL_FREE(env, dp); env->me_dp_reserve = dp; env->me_dp_reserve_len += 1; @@ -3883,7 +3890,7 @@ static void refund_loose(MDBX_txn *txn) { tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); pgno_t most = MIN_PAGENO; size_t w = 0; - for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { + for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { tASSERT(txn, lp->mp_flags == P_LOOSE); tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { @@ -3893,6 +3900,8 @@ static void refund_loose(MDBX_txn *txn) { suitable[++w] = lp->mp_pgno; most = (lp->mp_pgno > most) ? 
lp->mp_pgno : most; } + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } if (most + 1 == txn->mt_next_pgno) { @@ -3984,10 +3993,12 @@ static void refund_loose(MDBX_txn *txn) { for (MDBX_page **link = &txn->tw.loose_pages; *link;) { MDBX_page *dp = *link; tASSERT(txn, dp->mp_flags == P_LOOSE); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); if (txn->mt_next_pgno > dp->mp_pgno) { - link = &dp->mp_next; + link = &mp_next(dp); } else { - *link = dp->mp_next; + *link = mp_next(dp); if ((txn->mt_flags & MDBX_WRITEMAP) == 0) dpage_free(txn->mt_env, dp, 1); } @@ -4286,8 +4297,11 @@ status_done: pgno + txn->mt_env->me_options.dp_loose_limit || txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { DEBUG("loosen dirty page %" PRIaPGNO, pgno); + if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); + mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_LOOSE; - mp->mp_next = txn->tw.loose_pages; + mp_next(mp) = txn->tw.loose_pages; txn->tw.loose_pages = mp; txn->tw.loose_count++; #if MDBX_ENABLE_REFUND @@ -4295,8 +4309,6 @@ status_done: ? 
pgno + 2 : txn->tw.loose_refund_wl; #endif /* MDBX_ENABLE_REFUND */ - if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) - memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->mt_env->me_psize - PAGEHDRSZ); MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), @@ -4770,7 +4782,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, goto bailout; dpl_clear(txn->tw.dirtylist); txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; - for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = lp->mp_next) { + for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = mp_next(lp)) { rc = dpl_append(txn, lp->mp_pgno, lp, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -5505,20 +5517,22 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, int rc; if (unlikely(txn->tw.dirtyroom == 0)) { if (txn->tw.loose_count) { - MDBX_page *loose = txn->tw.loose_pages; - DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); - rc = pnl_insert_range(&txn->tw.relist, loose->mp_pgno, 1); + MDBX_page *lp = txn->tw.loose_pages; + DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->mp_pgno); + rc = pnl_insert_range(&txn->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - size_t di = dpl_search(txn, loose->mp_pgno); - tASSERT(txn, txn->tw.dirtylist->items[di].ptr == loose); + size_t di = dpl_search(txn, lp->mp_pgno); + tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp); dpl_remove(txn, di); - txn->tw.loose_pages = loose->mp_next; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + txn->tw.loose_pages = mp_next(lp); txn->tw.loose_count--; txn->tw.dirtyroom++; if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - dpage_free(txn->mt_env, loose, 1); + 
dpage_free(txn->mt_env, lp, 1); } } else { ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); @@ -7148,8 +7162,8 @@ done: #endif /* MDBX_ENABLE_PROFGC */ if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); } else { ret.page = page_malloc(txn, num); if (unlikely(!ret.page)) { @@ -7237,16 +7251,17 @@ __hot static pgr_t page_alloc(const MDBX_cursor *mc) { } #endif /* MDBX_ENABLE_REFUND */ - MDBX_page *page = txn->tw.loose_pages; - txn->tw.loose_pages = page->mp_next; + MDBX_page *lp = txn->tw.loose_pages; + MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->mt_env->me_psize); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + txn->tw.loose_pages = mp_next(lp); txn->tw.loose_count--; - DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), page->mp_pgno); - tASSERT(txn, page->mp_pgno < txn->mt_next_pgno); - tASSERT(txn, page->mp_pgno >= NUM_METAS); - VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env)); - page->mp_txnid = txn->mt_front; - pgr_t ret = {page, MDBX_SUCCESS}; + DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), lp->mp_pgno); + tASSERT(txn, lp->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, lp->mp_pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(lp), page_space(txn->mt_env)); + lp->mp_txnid = txn->mt_front; + pgr_t ret = {lp, MDBX_SUCCESS}; return ret; } @@ -8717,7 +8732,9 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, rc = pnl_insert_range(&parent->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto nested_failed; - parent->tw.loose_pages = lp->mp_next; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + 
VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + parent->tw.loose_pages = mp_next(lp); /* Remove from dirty list */ page_wash(parent, di, lp, 1); } while (parent->tw.loose_pages); @@ -9827,8 +9844,11 @@ retry: if (unlikely((rc = pnl_need(&txn->tw.retired_pages, txn->tw.loose_count)) != 0)) goto bailout; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) - pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { + pnl_xappend(txn->tw.retired_pages, lp->mp_pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + } TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, txn->tw.loose_count); } else { @@ -9839,9 +9859,11 @@ retry: MDBX_PNL loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - txn->tw.loose_count - 1; size_t count = 0; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { - tASSERT(txn, mp->mp_flags == P_LOOSE); - loose[++count] = mp->mp_pgno; + for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { + tASSERT(txn, lp->mp_flags == P_LOOSE); + loose[++count] = lp->mp_pgno; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } tASSERT(txn, count == txn->tw.loose_count); MDBX_PNL_SETSIZE(loose, count); @@ -11045,9 +11067,12 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { txn_refund(parent); if (ASSERT_ENABLED()) { /* Check parent's loose pages not suitable for refund */ - for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) + for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = mp_next(lp)) { tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && lp->mp_pgno + 1 < parent->mt_next_pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), 
sizeof(MDBX_page *)); + } /* Check parent's reclaimed pages not suitable for refund */ if (MDBX_PNL_GETSIZE(parent->tw.relist)) tASSERT(parent, @@ -14497,8 +14522,8 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { while ((dp = env->me_dp_reserve) != NULL) { MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dp_reserve = dp->mp_next; + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(dp); osal_free(dp); } VALGRIND_DESTROY_MEMPOOL(env); @@ -23882,8 +23907,8 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, eASSERT(env, env->me_dp_reserve != NULL); MDBX_page *dp = env->me_dp_reserve; MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dp_reserve = dp->mp_next; + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(dp); VALGRIND_MEMPOOL_FREE(env, dp); osal_free(dp); env->me_dp_reserve_len -= 1; diff --git a/src/internals.h b/src/internals.h index e5217ef6..1fcfbace 100644 --- a/src/internals.h +++ b/src/internals.h @@ -525,16 +525,12 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t - mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01u /* branch page */ #define P_LEAF 0x02u /* leaf page */ @@ -576,6 +572,8 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) +#define mp_next(mp) (*(MDBX_page **)((mp)->mp_ptrs + 2)) + #pragma pack(pop) typedef struct profgc_stat { From 7685b4080e9b1d1c4a42d8ad80e9f71110bff7c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 27 Nov 2022 12:31:42 +0300 Subject: [PATCH 239/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=D0=BE=D0=B7=D0=B2?= =?UTF-8?q?=D1=80=D0=B0=D1=82=D0=B0=20=D0=B8=20=D0=BF=D0=BE=D0=B4=D1=81?= =?UTF-8?q?=D1=87=D0=B5=D1=82=D0=B0=20"=D0=B3=D1=80=D1=8F=D0=B7=D0=BD?= =?UTF-8?q?=D1=8B=D1=85"=20=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86=20?= =?UTF-8?q?=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=B5=20`MDBX=5FWRITEMA?= =?UTF-8?q?P`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Исправление регрессии после коммита db72763de049d6e4546f838277fe83b9081ad1de. После отключения затратой поддержки списка "грязных" страниц логика page_retire_ex() оказалась не полной и требовала доработки. 
Из-за этого страницы добавленные или клонированные-и-измененные в текущей транзакции, которые становились не нужными, не возвращались к доступным для немедленного использования, а помещались в retired-список становящихся доступными в последующих транзакциях. В результате, в некоторых сценариях, особенно с интенсивным расщеплением страниц из-за вставки ключей, происходило необоснованно сильное потребление/выделение страниц БД. В свою очередь, это приводило к использованию излишнего кол-ва страниц, увеличению GC, росту RSS и размеру БД. --- src/core.c | 162 +++++++++++++++++++++++++++++------------------------ 1 file changed, 89 insertions(+), 73 deletions(-) diff --git a/src/core.c b/src/core.c index 65c261e6..b9cfd9f1 100644 --- a/src/core.c +++ b/src/core.c @@ -4078,20 +4078,31 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, } } -/* Remove page from dirty list */ +/* Remove page from dirty list, etc */ static __inline void page_wash(MDBX_txn *txn, const size_t di, MDBX_page *const mp, const size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, di && di <= txn->tw.dirtylist->length && - txn->tw.dirtylist->items[di].ptr == mp); - dpl_remove_ex(txn, di, npages); - txn->tw.dirtyroom++; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, (di > 0) == (txn->tw.dirtylist != nullptr)); mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_BAD; + + if (di) { + tASSERT(txn, txn->tw.dirtylist != nullptr); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, di <= txn->tw.dirtylist->length && + txn->tw.dirtylist->items[di].ptr == mp); + dpl_remove_ex(txn, di, npages); + txn->tw.dirtyroom++; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + } else { + tASSERT(txn, txn->tw.dirtylist == nullptr); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.writemap_dirty_npages >= npages); + txn->tw.writemap_dirty_npages -= npages; + } + VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); if (txn->mt_flags & MDBX_WRITEMAP) { VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), @@ -4128,9 +4139,9 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, * requires support the list of dirty pages and avoid explicit spilling. * So for flexibility and avoid extra internal dependencies we just * fallback to reading if dirty list was not allocated yet. */ - size_t di = 0, si = 0; - size_t npages = 1; - bool is_frozen = false, is_spilled = false, is_shadowed = false; + size_t di = 0, si = 0, npages = 1; + bool is_frozen = false, is_spilled = false, is_shadowed = false, + is_modifable = false; if (unlikely(!mp)) { if (ASSERT_ENABLED() && pageflags) { pgr_t check; @@ -4154,6 +4165,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, if ((di = dpl_exist(txn, pgno)) != 0) { mp = txn->tw.dirtylist->items[di].ptr; tASSERT(txn, IS_MODIFIABLE(txn, mp)); + is_modifable = true; goto status_done; } if ((si = search_spilled(txn, pgno)) != 0) { @@ -4185,10 +4197,10 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, is_frozen = IS_FROZEN(txn, mp); if (!is_frozen) { - const bool is_dirty = IS_MODIFIABLE(txn, mp); - is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); + is_modifable = IS_MODIFIABLE(txn, mp); is_shadowed = IS_SHADOWED(txn, mp); - if (is_dirty) { + is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); + if (is_modifable) { tASSERT(txn, !is_spilled); tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || @@ -4197,9 +4209,9 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t 
pgno, tASSERT(txn, !debug_dpl_find(txn, pgno)); } - di = (is_dirty && txn->tw.dirtylist) ? dpl_exist(txn, pgno) : 0; + di = (is_modifable && txn->tw.dirtylist) ? dpl_exist(txn, pgno) : 0; si = is_spilled ? search_spilled(txn, pgno) : 0; - tASSERT(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !is_modifable || di || (txn->mt_flags & MDBX_WRITEMAP)); } else { tASSERT(txn, !IS_MODIFIABLE(txn, mp)); tASSERT(txn, !IS_SPILLED(txn, mp)); @@ -4240,22 +4252,19 @@ status_done: * нераспределенного "хвоста" БД сдвигается только при их коммите. */ if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) { const char *kind = nullptr; - if (di) { + if (is_modifable) { /* Страница испачкана в этой транзакции, но до этого могла быть * аллоцирована, испачкана и пролита в одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "dirty"; /* Remove from dirty list */ - page_wash(txn, di, mp, npages); + page_wash(txn, di, mp ? mp : pgno2page(txn->mt_env, pgno), npages); } else if (si) { /* Страница пролита в этой транзакции, т.е. она аллоцирована * и запачкана в этой или одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "spilled"; spill_remove(txn, si, npages); - } else if (txn->mt_flags & MDBX_WRITEMAP) { - kind = "writemap"; - tASSERT(txn, mp && IS_MODIFIABLE(txn, mp)); } else { /* Страница аллоцирована, запачкана и возможно пролита в одной * из родительских транзакций. 
@@ -4286,67 +4295,74 @@ status_done: return MDBX_SUCCESS; } - if (di) { - /* Dirty page from this transaction */ - /* If suitable we can reuse it through loose list */ - if (likely(npages == 1 && - txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && - (!MDBX_ENABLE_REFUND || - /* skip pages near to the end in favor of compactification */ - txn->mt_next_pgno > - pgno + txn->mt_env->me_options.dp_loose_limit || - txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { - DEBUG("loosen dirty page %" PRIaPGNO, pgno); - if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) - memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); - mp->mp_txnid = INVALID_TXNID; - mp->mp_flags = P_LOOSE; - mp_next(mp) = txn->tw.loose_pages; - txn->tw.loose_pages = mp; - txn->tw.loose_count++; + if (is_modifable) { + if (di) { + /* Dirty page from this transaction */ + /* If suitable we can reuse it through loose list */ + if (likely( + npages == 1 && + txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && + (!MDBX_ENABLE_REFUND || + /* skip pages near to the end in favor of compactification */ + txn->mt_next_pgno > + pgno + txn->mt_env->me_options.dp_loose_limit || + txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { + DEBUG("loosen dirty page %" PRIaPGNO, pgno); + if (MDBX_DEBUG != 0 || + unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); + mp->mp_txnid = INVALID_TXNID; + mp->mp_flags = P_LOOSE; + mp_next(mp) = txn->tw.loose_pages; + txn->tw.loose_pages = mp; + txn->tw.loose_count++; #if MDBX_ENABLE_REFUND - txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) - ? pgno + 2 - : txn->tw.loose_refund_wl; + txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) + ? 
pgno + 2 + : txn->tw.loose_refund_wl; #endif /* MDBX_ENABLE_REFUND */ - VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - txn->mt_env->me_psize - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), - txn->mt_env->me_psize - PAGEHDRSZ); - return MDBX_SUCCESS; - } + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + txn->mt_env->me_psize - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), + txn->mt_env->me_psize - PAGEHDRSZ); + return MDBX_SUCCESS; + } #if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__) - if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif - { - /* Страница могла быть изменена в одной из родительских транзакций, - * в том числе, позже выгружена и затем снова загружена и изменена. - * В обоих случаях её нельзя затирать на диске и помечать недоступной - * в asan и/или valgrind */ - for (MDBX_txn *parent = txn->mt_parent; - parent && (parent->mt_flags & MDBX_TXN_SPILLS); - parent = parent->mt_parent) { - if (intersect_spilled(parent, pgno, npages)) - goto skip_invalidate; - if (dpl_intersect(parent, pgno, npages)) - goto skip_invalidate; - } + { + /* Страница могла быть изменена в одной из родительских транзакций, + * в том числе, позже выгружена и затем снова загружена и изменена. 
+ * В обоих случаях её нельзя затирать на диске и помечать недоступной + * в asan и/или valgrind */ + for (MDBX_txn *parent = txn->mt_parent; + parent && (parent->mt_flags & MDBX_TXN_SPILLS); + parent = parent->mt_parent) { + if (intersect_spilled(parent, pgno, npages)) + goto skip_invalidate; + if (dpl_intersect(parent, pgno, npages)) + goto skip_invalidate; + } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + if (MDBX_DEBUG != 0 || + unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif - kill_page(txn, mp, pgno, npages); - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { - VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)), - pgno2bytes(txn->mt_env, npages) - - PAGEHDRSZ); + kill_page(txn, mp, pgno, npages); + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), + pgno2bytes(txn->mt_env, npages) - + PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION( + page_data(pgno2page(txn->mt_env, pgno)), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + } } + skip_invalidate:; } - skip_invalidate: - /* Remove from dirty list */ + + /* wash dirty page */ page_wash(txn, di, mp, npages); reclaim: From 05804e2f30703d634d3e13c9132cbe70b71a9329 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 28 Nov 2022 01:20:36 +0300 Subject: [PATCH 240/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B0/=D0=BE=D0=BF=D1=82=D0=B8=D0=BC=D0=B8?= =?UTF-8?q?=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20`page=5Fretire=5Fex()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 83 
+++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/src/core.c b/src/core.c index b9cfd9f1..e2167874 100644 --- a/src/core.c +++ b/src/core.c @@ -3238,9 +3238,9 @@ static int __must_check_result page_check(MDBX_cursor *const mc, static int __must_check_result cursor_check(MDBX_cursor *mc); static int __must_check_result cursor_check_updating(MDBX_cursor *mc); static int __must_check_result cursor_del(MDBX_cursor *mc); -static int __must_check_result delete (MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *key, - const MDBX_val *data, unsigned flags); +static int __must_check_result delete(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *key, const MDBX_val *data, + unsigned flags); #define SIBLING_LEFT 0 #define SIBLING_RIGHT 2 static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir); @@ -4140,8 +4140,14 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, * So for flexibility and avoid extra internal dependencies we just * fallback to reading if dirty list was not allocated yet. 
*/ size_t di = 0, si = 0, npages = 1; - bool is_frozen = false, is_spilled = false, is_shadowed = false, - is_modifable = false; + enum page_status { + unknown, + frozen, + spilled, + shadowed, + modifable + } status = unknown; + if (unlikely(!mp)) { if (ASSERT_ENABLED() && pageflags) { pgr_t check; @@ -4153,7 +4159,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } if (pageflags & P_FROZEN) { - is_frozen = true; + status = frozen; if (ASSERT_ENABLED()) { for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { tASSERT(txn, !txn->tw.spilled.list || !search_spilled(scan, pgno)); @@ -4165,25 +4171,25 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, if ((di = dpl_exist(txn, pgno)) != 0) { mp = txn->tw.dirtylist->items[di].ptr; tASSERT(txn, IS_MODIFIABLE(txn, mp)); - is_modifable = true; + status = modifable; goto status_done; } if ((si = search_spilled(txn, pgno)) != 0) { - is_spilled = true; + status = spilled; goto status_done; } for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { if (dpl_exist(parent, pgno)) { - is_shadowed = true; + status = shadowed; goto status_done; } if (search_spilled(parent, pgno)) { - is_spilled = true; + status = spilled; goto status_done; } } - is_frozen = true; + status = frozen; goto status_done; } @@ -4195,27 +4201,27 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, pageflags = mp->mp_flags; } - is_frozen = IS_FROZEN(txn, mp); - if (!is_frozen) { - is_modifable = IS_MODIFIABLE(txn, mp); - is_shadowed = IS_SHADOWED(txn, mp); - is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); - if (is_modifable) { - tASSERT(txn, !is_spilled); - tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); - tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || - (txn->mt_flags & MDBX_WRITEMAP)); - } else { - tASSERT(txn, !debug_dpl_find(txn, pgno)); - } - - di = 
(is_modifable && txn->tw.dirtylist) ? dpl_exist(txn, pgno) : 0; - si = is_spilled ? search_spilled(txn, pgno) : 0; - tASSERT(txn, !is_modifable || di || (txn->mt_flags & MDBX_WRITEMAP)); - } else { + if (IS_FROZEN(txn, mp)) { + status = frozen; tASSERT(txn, !IS_MODIFIABLE(txn, mp)); tASSERT(txn, !IS_SPILLED(txn, mp)); tASSERT(txn, !IS_SHADOWED(txn, mp)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + } else if (IS_MODIFIABLE(txn, mp)) { + status = modifable; + di = txn->tw.dirtylist ? dpl_exist(txn, pgno) : 0; + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) || !IS_SPILLED(txn, mp)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + } else if (IS_SHADOWED(txn, mp)) { + status = shadowed; + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + } else { + tASSERT(txn, IS_SPILLED(txn, mp)); + status = spilled; + si = search_spilled(txn, pgno); + tASSERT(txn, !debug_dpl_find(txn, pgno)); } status_done: @@ -4239,7 +4245,7 @@ status_done: mc->mc_db->md_overflow_pages -= (pgno_t)npages; } - if (is_frozen) { + if (status == frozen) { retire: DEBUG("retire %zu page %" PRIaPGNO, npages, pgno); rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); @@ -4252,7 +4258,7 @@ status_done: * нераспределенного "хвоста" БД сдвигается только при их коммите. */ if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) { const char *kind = nullptr; - if (is_modifable) { + if (status == modifable) { /* Страница испачкана в этой транзакции, но до этого могла быть * аллоцирована, испачкана и пролита в одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ @@ -4264,6 +4270,7 @@ status_done: * и запачкана в этой или одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. 
*/ kind = "spilled"; + tASSERT(txn, status == spilled); spill_remove(txn, si, npages); } else { /* Страница аллоцирована, запачкана и возможно пролита в одной @@ -4276,18 +4283,18 @@ status_done: parent = parent->mt_parent) { if (search_spilled(parent, pgno)) { kind = "parent-spilled"; - tASSERT(txn, is_spilled); + tASSERT(txn, status == spilled); break; } if (mp == debug_dpl_find(parent, pgno)) { kind = "parent-dirty"; - tASSERT(txn, !is_spilled); + tASSERT(txn, status == shadowed); break; } } tASSERT(txn, kind != nullptr); } - tASSERT(txn, is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); + tASSERT(txn, status == spilled || status == shadowed); } DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno); txn->mt_next_pgno = pgno; @@ -4295,7 +4302,7 @@ status_done: return MDBX_SUCCESS; } - if (is_modifable) { + if (status == modifable) { if (di) { /* Dirty page from this transaction */ /* If suitable we can reuse it through loose list */ @@ -4392,7 +4399,7 @@ status_done: goto reclaim; } - if (is_shadowed) { + if (status == shadowed) { /* Dirty page MUST BE a clone from (one of) parent transaction(s). 
*/ if (ASSERT_ENABLED()) { const MDBX_page *parent_dp = nullptr; @@ -19569,8 +19576,8 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, return delete (txn, dbi, key, data, 0); } -static int delete (MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - const MDBX_val *data, unsigned flags) { +static int delete(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data, unsigned flags) { MDBX_cursor_couple cx; MDBX_cursor_op op; MDBX_val rdata; From 8c74de57eaaaccb848f1e8451d74301ace83ae56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 28 Nov 2022 08:30:50 +0300 Subject: [PATCH 241/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`txn=5Fcommit()`=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D1=81=D0=BB=D1=83=D1=87=D0=B0=D0=B5=D0=B2?= =?UTF-8?q?=20=D0=BA=D0=BE=D0=BD=D0=BA=D1=83=D1=80=D0=B5=D0=BD=D1=82=D0=BD?= =?UTF-8?q?=D1=8B=D1=85=20=D0=B8/=D0=B8=D0=BB=D0=B8=20=D0=BD=D0=B5=D0=B2?= =?UTF-8?q?=D0=B5=D1=80=D0=BD=D1=8B=D1=85=20=D0=B2=D1=8B=D0=B7=D0=BE=D0=B2?= =?UTF-8?q?=D0=BE=D0=B2=20=D0=BF=D1=80=D0=B8=20`MDBX=5FENABLE=5FPROFGC=3D1?= =?UTF-8?q?`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 89 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/src/core.c b/src/core.c index e2167874..095bd1ca 100644 --- a/src/core.c +++ b/src/core.c @@ -10927,22 +10927,54 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } } +static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { + MDBX_env *const env = txn->mt_env; + if (MDBX_ENABLE_PROFGC) { + pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; + latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; + latency->gc_prof.work_rtime_monotonic = + 
osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); + latency->gc_prof.work_xtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_monotonic); + latency->gc_prof.work_rtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_cpu); + latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; + latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; + latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; + + latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; + latency->gc_prof.self_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); + latency->gc_prof.self_xtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_monotonic); + latency->gc_prof.self_rtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_cpu); + latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; + latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; + latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; + + latency->gc_prof.wloops = ptr->gc_prof.wloops; + latency->gc_prof.coalescences = ptr->gc_prof.coalescences; + latency->gc_prof.wipes = ptr->gc_prof.wipes; + latency->gc_prof.flushes = ptr->gc_prof.flushes; + latency->gc_prof.kicks = ptr->gc_prof.kicks; + if (txn == env->me_txn0) + memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); + } else + memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); +} + int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); const uint64_t ts_0 = latency ? 
osal_monotime() : 0; uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; - MDBX_env *const env = txn->mt_env; int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) goto provide_latency; - if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { - rc = MDBX_RESULT_TRUE; - goto fail; - } - + MDBX_env *const env = txn->mt_env; #if MDBX_ENV_CHECKPID if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; @@ -10951,6 +10983,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } #endif /* MDBX_ENV_CHECKPID */ + if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { + rc = MDBX_RESULT_TRUE; + goto fail; + } + /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; @@ -11268,6 +11305,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: + if (latency) + take_gcprof(txn, latency); rc = txn_end(txn, end_mode); provide_latency: @@ -11279,42 +11318,6 @@ provide_latency: latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; latency->sync = (ts_5 > ts_4) ? 
osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - -#if MDBX_ENABLE_PROFGC - pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; - latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; - latency->gc_prof.work_rtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); - latency->gc_prof.work_xtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_monotonic); - latency->gc_prof.work_rtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_cpu); - latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; - latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; - latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; - - latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; - latency->gc_prof.self_rtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); - latency->gc_prof.self_xtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_monotonic); - latency->gc_prof.self_rtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_cpu); - latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; - latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; - latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; - - latency->gc_prof.wloops = ptr->gc_prof.wloops; - latency->gc_prof.coalescences = ptr->gc_prof.coalescences; - latency->gc_prof.wipes = ptr->gc_prof.wipes; - latency->gc_prof.flushes = ptr->gc_prof.flushes; - latency->gc_prof.kicks = ptr->gc_prof.kicks; - if (txn == env->me_txn0) - memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); -#else - memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); -#endif /* MDBX_ENABLE_PROFGC */ - const uint64_t ts_6 = osal_monotime(); latency->ending = ts_5 ? 
osal_monotime_to_16dot16(ts_6 - ts_5) : 0; latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); @@ -11323,6 +11326,8 @@ provide_latency: fail: txn->mt_flags |= MDBX_TXN_ERROR; + if (latency) + take_gcprof(txn, latency); mdbx_txn_abort(txn); goto provide_latency; } From 9cee1ff799a6d09472d2b9e910a41bc77d8ef9e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 28 Nov 2022 17:40:02 +0300 Subject: [PATCH 242/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=BF=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D0=B5=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`ior=5FWriteFile=5Fflag`?= =?UTF-8?q?=20=D0=B4=D0=BB=D1=8F=20=D1=8F=D1=81=D0=BD=D0=BE=D1=81=D1=82?= =?UTF-8?q?=D0=B8=20=D0=BA=D0=BE=D0=B4=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/osal.c b/src/osal.c index ccf2205a..e08e09ab 100644 --- a/src/osal.c +++ b/src/osal.c @@ -567,6 +567,7 @@ MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, #if defined(_WIN32) || defined(_WIN64) #define ior_alignment_mask (ior->pagesize - 1) +#define ior_WriteFile_flag 1 #define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element)) static void ior_put_event(osal_ioring_t *ior, HANDLE event) { @@ -677,7 +678,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, (uintptr_t)(uint64_t)item->sgv[0].Buffer) & ior_alignment_mask) == 0 && ior->last_sgvcnt + segments < OSAL_IOV_MAX) { - assert((item->single.iov_len & 1) == 0); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); assert(item->sgv[ior->last_sgvcnt].Buffer == 0); ior->last_bytes += bytes; size_t i = 0; @@ -687,13 +688,13 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, } while (++i < segments); ior->slots_left -= segments; item->sgv[ior->last_sgvcnt += 
segments].Buffer = 0; - assert((item->single.iov_len & 1) == 0); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); return MDBX_SUCCESS; } - const void *end = - (char *)(item->single.iov_base) + item->single.iov_len - 1; + const void *end = (char *)(item->single.iov_base) + item->single.iov_len - + ior_WriteFile_flag; if (unlikely(end == data)) { - assert((item->single.iov_len & 1) != 0); + assert((item->single.iov_len & ior_WriteFile_flag) != 0); item->single.iov_len += bytes; return MDBX_SUCCESS; } @@ -740,8 +741,8 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, segments > OSAL_IOV_MAX) { /* WriteFile() */ item->single.iov_base = data; - item->single.iov_len = bytes + 1; - assert((item->single.iov_len & 1) != 0); + item->single.iov_len = bytes + ior_WriteFile_flag; + assert((item->single.iov_len & ior_WriteFile_flag) != 0); } else { /* WriteFileGather() */ item->sgv[0].Buffer = PtrToPtr64(data); @@ -750,7 +751,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, item->sgv[slots_used].Buffer = PtrToPtr64(data); } item->sgv[slots_used].Buffer = 0; - assert((item->single.iov_len & 1) == 0); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); slots_used = segments; } ior->last_bytes = bytes; @@ -778,9 +779,9 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk( #if defined(_WIN32) || defined(_WIN64) size_t offset = ior_offset(item); char *data = item->single.iov_base; - size_t bytes = item->single.iov_len - 1; + size_t bytes = item->single.iov_len - ior_WriteFile_flag; size_t i = 1; - if (bytes & 1) { + if (bytes & ior_WriteFile_flag) { data = Ptr64ToPtr(item->sgv[0].Buffer); bytes = ior->pagesize; while (item->sgv[i].Buffer) { @@ -824,9 +825,9 @@ osal_ioring_write(osal_ioring_t *ior) { LONG async_started = 0; for (ior_item_t *item = ior->pool; item <= ior->last;) { item->ov.Internal = STATUS_PENDING; - size_t i = 1, bytes = item->single.iov_len - 1; + size_t i = 1, bytes = 
item->single.iov_len - ior_WriteFile_flag; r.wops += 1; - if (bytes & 1) { + if (bytes & ior_WriteFile_flag) { bytes = ior->pagesize; while (item->sgv[i].Buffer) { bytes += ior->pagesize; @@ -964,8 +965,8 @@ osal_ioring_write(osal_ioring_t *ior) { assert(ior->async_waiting == ior->async_completed); for (ior_item_t *item = ior->pool; item <= ior->last;) { - size_t i = 1, bytes = item->single.iov_len - 1; - if (bytes & 1) { + size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; + if (bytes & ior_WriteFile_flag) { bytes = ior->pagesize; while (item->sgv[i].Buffer) { bytes += ior->pagesize; @@ -1059,7 +1060,7 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { if (item->ov.hEvent && item->ov.hEvent != ior) ior_put_event(ior, item->ov.hEvent); size_t i = 1; - if ((item->single.iov_len & 1) == 0) + if ((item->single.iov_len & ior_WriteFile_flag) == 0) while (item->sgv[i].Buffer) ++i; item = ior_next(item, i); From d6b9a7182583a1331dd555e2b12ade803b3b365d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 28 Nov 2022 21:37:08 +0300 Subject: [PATCH 243/364] =?UTF-8?q?mdbx-test:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B8=D1=81=D0=BA=D0=BB?= =?UTF-8?q?=D1=8E=D1=87=D0=B5=D0=BD=D0=B8=D0=B9=20Valgrind=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD=D0=B5=D0=BD=D0=BD=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D0=BA=D0=BE=D0=B4=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/valgrind_suppress.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/valgrind_suppress.txt b/test/valgrind_suppress.txt index 96d1327b..3d0d1be4 100644 --- a/test/valgrind_suppress.txt +++ b/test/valgrind_suppress.txt @@ -30,6 +30,14 @@ ... fun:wipe_steady* } +{ + msync-meta + Memcheck:Param + msync(start) + fun:msync + ... 
+ fun:meta_sync* +} # memcmp() inside iov_write() as workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269 { From 01a39e7dc21781846e1befd94b78c6918ef9f3e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 29 Nov 2022 01:10:44 +0300 Subject: [PATCH 244/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B8=20=D0=B8=D1=81=D0=BF?= =?UTF-8?q?=D0=BE=D0=BB=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20?= =?UTF-8?q?`ptr=5Fdisp()`=20=D0=B8=20`ptr=5Fdist()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Для уменьшения кастинга типов указателей и потенциальной нагрузки оптимизатора/кодогенератора алиасингом. --- src/core.c | 344 +++++++++++++++++++++++------------------------- src/internals.h | 16 ++- src/osal.c | 136 ++++++++++--------- src/osal.h | 5 +- 4 files changed, 248 insertions(+), 253 deletions(-) diff --git a/src/core.c b/src/core.c index 095bd1ca..44bb8bf1 100644 --- a/src/core.c +++ b/src/core.c @@ -260,24 +260,24 @@ static __always_inline void unaligned_poke_u64(const size_t expected_alignment, } #define UNALIGNED_PEEK_8(ptr, struct, field) \ - peek_u8((const uint8_t *)(ptr) + offsetof(struct, field)) + peek_u8(ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_8(ptr, struct, field, value) \ - poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value) + poke_u8(ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_16(ptr, struct, field) \ - unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u16(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_16(ptr, struct, field, value) \ - unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u16(1, ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_32(ptr, struct, field) \ - 
unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u32(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_32(ptr, struct, field, value) \ - unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u32(1, ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_64(ptr, struct, field) \ - unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u64(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_64(ptr, struct, field, value) \ - unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u64(1, ptr_disp(ptr, offsetof(struct, field)), value) /* Get the page number pointed to by a branch node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t @@ -341,13 +341,13 @@ static __always_inline void node_set_flags(MDBX_node *const __restrict node, /* Address of the key for the node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * node_key(const MDBX_node *const __restrict node) { - return (char *)node + NODESIZE; + return ptr_disp(node, NODESIZE); } /* Address of the data for a node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * node_data(const MDBX_node *const __restrict node) { - return (char *)node_key(node) + node_ks(node); + return ptr_disp(node_key(node), node_ks(node)); } /* Size of a node in a leaf page with a given key and data. 
@@ -653,7 +653,7 @@ pgno2bytes(const MDBX_env *env, size_t pgno) { MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page * pgno2page(const MDBX_env *env, size_t pgno) { - return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); + return ptr_disp(env->me_map, pgno2bytes(env, pgno)); } MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t @@ -680,7 +680,7 @@ bytes_align2os_bytes(const MDBX_env *env, size_t bytes) { /* Address of first usable data byte in a page, after the header */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * page_data(const MDBX_page *mp) { - return (char *)mp + PAGEHDRSZ; + return ptr_disp(mp, PAGEHDRSZ); } MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page * @@ -799,7 +799,7 @@ page_node(const MDBX_page *mp, size_t i) { assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH); assert(page_numkeys(mp) > i); assert(mp->mp_ptrs[i] % 2 == 0); - return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); + return ptr_disp(mp, mp->mp_ptrs[i] + PAGEHDRSZ); } /* The address of a key in a LEAF2 page. @@ -810,7 +810,7 @@ page_leaf2key(const MDBX_page *mp, size_t i, size_t keysize) { assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); assert(mp->mp_leaf2_ksize == keysize); (void)keysize; - return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); + return ptr_disp(mp, PAGEHDRSZ + i * mp->mp_leaf2_ksize); } /* Set the node's key into keyptr. 
*/ @@ -2114,7 +2114,7 @@ static int lcklist_detach_locked(MDBX_env *env) { } while (++r != end); \ \ if (unlikely(key_diff_mask < 256)) { \ - memcpy(begin, tmp, (char *)end - (char *)begin); \ + memcpy(begin, tmp, ptr_dist(end, begin)); \ break; \ } \ end = (r = tmp) + length; \ @@ -3698,7 +3698,7 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { size_t skip = PAGEHDRSZ; if (num > 1) skip += pgno2bytes(env, num - 1); - memset((char *)np + skip, 0, size - skip); + memset(ptr_disp(np, skip), 0, size - skip); } #if MDBX_DEBUG np->mp_pgno = 0; @@ -4064,7 +4064,7 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, } else { struct iovec iov[MDBX_AUXILARY_IOV_MAX]; iov[0].iov_len = env->me_psize; - iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; + iov[0].iov_base = ptr_disp(env->me_pbuf, env->me_psize); size_t iov_off = pgno2bytes(env, pgno), n = 1; while (--npages) { iov[n] = iov[0]; @@ -4475,10 +4475,10 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); if (likely(ctx->err == MDBX_SUCCESS)) { - VALGRIND_MAKE_MEM_DEFINED(env->me_map + offset, bytes); - MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map + offset, bytes); - osal_flush_incoherent_mmap(env->me_map + offset, bytes, env->me_os_psize); - const MDBX_page *const rp = (const MDBX_page *)(env->me_map + offset); + const MDBX_page *const rp = ptr_disp(env->me_map, offset); + VALGRIND_MAKE_MEM_DEFINED(rp, bytes); + MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes); + osal_flush_incoherent_mmap(rp, bytes, env->me_os_psize); /* check with timeout as the workaround * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ if (unlikely(memcmp(wp, rp, bytes))) { @@ -4504,7 +4504,7 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, size_t npages = IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u; size_t chunk = pgno2bytes(env, npages); eASSERT(env, bytes >= chunk); - MDBX_page *next = (MDBX_page *)((char *)wp + chunk); + MDBX_page *next = ptr_disp(wp, chunk); dpage_free(env, wp, npages); wp = next; offset += chunk; @@ -5640,16 +5640,15 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, #endif /* F_RDAHEAD */ int err; + void *const ptr = ptr_disp(env->me_map, offset); if (enable) { #if defined(MADV_NORMAL) - err = madvise(env->me_map + offset, length, MADV_NORMAL) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = + madvise(ptr, length, MADV_NORMAL) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_NORMAL) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_NORMAL)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) @@ -5678,20 +5677,18 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( env->me_lazy_fd, F_RDADVISE, &hint); #elif defined(MADV_WILLNEED) - err = madvise(env->me_map + offset, length, MADV_WILLNEED) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = madvise(ptr, length, MADV_WILLNEED) ? 
ignore_enosys(errno) + : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_WILLNEED) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_WILLNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(_WIN32) || defined(_WIN64) if (mdbx_PrefetchVirtualMemory) { WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = env->me_map + offset; + hint.VirtualAddress = ptr; hint.NumberOfBytes = length; (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); } @@ -5706,14 +5703,12 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, } } else { #if defined(MADV_RANDOM) - err = madvise(env->me_map + offset, length, MADV_RANDOM) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = + madvise(ptr, length, MADV_RANDOM) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_RANDOM) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_RANDOM)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_FADV_RANDOM) @@ -5787,14 +5782,15 @@ __cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, munlock_begin % env->me_os_psize == 0 && munlock_size % env->me_os_psize == 0); #if defined(_WIN32) || defined(_WIN64) - err = VirtualUnlock(env->me_map + munlock_begin, munlock_size) + err = VirtualUnlock(ptr_disp(env->me_map, munlock_begin), munlock_size) ? MDBX_SUCCESS : (int)GetLastError(); if (err == ERROR_NOT_LOCKED) err = MDBX_SUCCESS; #elif defined(_POSIX_MEMLOCK_RANGE) - err = munlock(env->me_map + munlock_begin, munlock_size) ? errno - : MDBX_SUCCESS; + err = munlock(ptr_disp(env->me_map, munlock_begin), munlock_size) + ? 
errno + : MDBX_SUCCESS; #endif if (likely(err == MDBX_SUCCESS)) update_mlcnt(env, aligned_pgno, false); @@ -5821,7 +5817,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, const size_t prev_size = env->me_dxb_mmap.current; const size_t prev_limit = env->me_dxb_mmap.limit; #if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) - const void *const prev_addr = env->me_map; + const void *const prev_map = env->me_dxb_mmap.base; #endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ VERBOSE("resize datafile/mapping: " @@ -5932,20 +5928,20 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, rc = MDBX_RESULT_TRUE; #if defined(MADV_REMOVE) if (env->me_flags & MDBX_WRITEMAP) - rc = - madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, + MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; #endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) if (rc == MDBX_RESULT_TRUE) - rc = madvise(env->me_map + size_bytes, prev_size - size_bytes, + rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, MADV_DONTNEED) ? 
ignore_enosys(errno) : MDBX_SUCCESS; #elif defined(POSIX_MADV_DONTNEED) if (rc == MDBX_RESULT_TRUE) - rc = ignore_enosys(posix_madvise(env->me_map + size_bytes, + rc = ignore_enosys(posix_madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, POSIX_MADV_DONTNEED)); #elif defined(POSIX_FADV_DONTNEED) @@ -5988,7 +5984,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, !(env->me_flags & MDBX_NORDAHEAD) && mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); const bool force = limit_bytes != prev_limit || - env->me_dxb_mmap.address != prev_addr + env->me_dxb_mmap.base != prev_map #if defined(_WIN32) || defined(_WIN64) || prev_size > size_bytes #endif /* Windows */ @@ -6003,7 +5999,7 @@ bailout: eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); eASSERT(env, size_bytes == env->me_dxb_mmap.current); #ifdef MDBX_USE_VALGRIND - if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { + if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = 0; if (env->me_dxb_mmap.limit) @@ -6023,7 +6019,7 @@ bailout: "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", prev_size, size_bytes, prev_limit, limit_bytes, rc); } - if (!env->me_dxb_mmap.address) { + if (!env->me_dxb_mmap.base) { env->me_flags |= MDBX_FATAL_ERROR; if (env->me_txn) env->me_txn->mt_flags |= MDBX_TXN_ERROR; @@ -6079,7 +6075,7 @@ static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, const uint64_t wipe = MDBX_DATASIGN_NONE; const void *ptr = &wipe; size_t bytes = sizeof(meta->mm_sign), - offset = (uint8_t *)&meta->mm_sign - env->me_map; + offset = ptr_dist(&meta->mm_sign, env->me_map); if (env->me_flags & MDBX_WRITEMAP) { unaligned_poke_u64(4, meta->mm_sign, wipe); osal_flush_incoherent_cpu_writeback(); @@ -6093,7 +6089,7 @@ static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, return err; } ptr = data_page(meta); - offset = (uint8_t 
*)ptr - env->me_map; + offset = ptr_dist(ptr, env->me_map); bytes = env->me_psize; } @@ -6517,8 +6513,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #ifndef __SANITIZE_ADDRESS__ found: #endif /* __SANITIZE_ADDRESS__ */ - return (pgno_t *)((char *)range - - (__builtin_clzl(mask) >> sizeof(size_t) / 4)); + return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4)); } range -= 4; } while (range > detent + 3); @@ -6796,8 +6791,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, flags += MDBX_ALLOC_COALESCE; } - MDBX_cursor *const gc = - (MDBX_cursor *)((char *)env->me_txn0 + sizeof(MDBX_txn)); + MDBX_cursor *const gc = ptr_disp(env->me_txn0, sizeof(MDBX_txn)); eASSERT(env, mc != gc && gc->mc_next == nullptr); gc->mc_txn = txn; gc->mc_flags = 0; @@ -7339,8 +7333,8 @@ __hot static void page_copy(MDBX_page *const dst, const MDBX_page *const src, const size_t size) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); - char *copy_dst = (void *)dst; - const char *copy_src = (const void *)src; + void *copy_dst = dst; + const void *copy_src = src; size_t copy_len = size; if (src->mp_flags & P_LEAF2) { copy_len = PAGEHDRSZ + src->mp_leaf2_ksize * page_numkeys(src); @@ -7358,8 +7352,8 @@ __hot static void page_copy(MDBX_page *const dst, const MDBX_page *const src, if (unlikely(upper > copy_len)) goto bailout; memcpy(copy_dst, copy_src, lower); - copy_dst += upper; - copy_src += upper; + copy_dst = ptr_disp(copy_dst, upper); + copy_src = ptr_disp(copy_src, upper); copy_len -= upper; } } @@ -7585,7 +7579,7 @@ static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { #endif /* MDBX_ENABLE_PGOP_STAT */ const MDBX_page *page = data_page(head.ptr_c); rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - (uint8_t *)page - env->me_map); + ptr_dist(page, env->me_map)); if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { rc = 
osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); @@ -7947,10 +7941,11 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { if (edge > last) { eASSERT(env, last >= NUM_METAS); env->me_poison_edge = last; - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last), + VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, pgno2bytes(env, last)), pgno2bytes(env, edge - last)); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last), - pgno2bytes(env, edge - last)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->me_map, pgno2bytes(env, last)), + pgno2bytes(env, edge - last)); } if (should_unlock) mdbx_txn_unlock(env); @@ -8221,7 +8216,7 @@ static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, if (report) WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid, - bytes2pgno(env, (const uint8_t *)meta - env->me_dxb_mmap.dxb), + bytes2pgno(env, ptr_dist(meta, env->me_map)), "(workaround for incoherent flaw of unified page/buffer cache)"); return coherency_timeout(timestamp, 0); } @@ -8548,7 +8543,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { } tASSERT(txn, txn == env->me_txn0); - MDBX_cursor *const gc = (MDBX_cursor *)((char *)txn + sizeof(MDBX_txn)); + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); rc = cursor_init(gc, txn, FREE_DBI); if (rc != MDBX_SUCCESS) goto bailout; @@ -8718,9 +8713,9 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, #endif /* MDBX_DEBUG */ memset(txn, 0, tsize); txn->mt_dbxs = env->me_dbxs; /* static */ - txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbistate = (uint8_t *)txn + size - env->me_maxdbs; + txn->mt_dbs = ptr_disp(txn, tsize); + txn->mt_cursors = ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); + txn->mt_dbistate = ptr_disp(txn, size - env->me_maxdbs); 
txn->mt_flags = flags; txn->mt_env = env; @@ -9340,7 +9335,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { "the parent", rc); parent->mt_flags |= MDBX_TXN_ERROR; - if (!env->me_dxb_mmap.address) + if (!env->me_dxb_mmap.base) env->me_flags |= MDBX_FATAL_ERROR; } } @@ -9558,7 +9553,7 @@ static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { int err = MDBX_SUCCESS; if (ctx->retired_stored) { - MDBX_cursor *const gc = (MDBX_cursor *)((char *)txn + sizeof(MDBX_txn)); + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); tASSERT(txn, txn == txn->mt_env->me_txn0 && gc->mc_next == nullptr); gc->mc_txn = txn; gc->mc_flags = 0; @@ -11584,7 +11579,7 @@ static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, MDBX_meta *dest) { *dest = *meta; return validate_meta(env, dest, data_page(meta), - bytes2pgno(env, (uint8_t *)meta - env->me_map), nullptr); + bytes2pgno(env, ptr_dist(meta, env->me_map)), nullptr); } /* Read the environment parameters of a DB environment @@ -11753,7 +11748,7 @@ __cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, meta_set_txnid(env, model_meta, MIN_TXNID + num); unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta)); eASSERT(env, coherency_check_meta(env, model_meta, true)); - return (MDBX_page *)((uint8_t *)model + env->me_psize); + return ptr_disp(model, env->me_psize); } /* Fill in most of the zeroed meta-pages for an empty database environment. 
@@ -11823,11 +11818,12 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, const pgno_t edge = env->me_poison_edge; if (edge > largest_pgno) { env->me_poison_edge = largest_pgno; - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno), - pgno2bytes(env, edge - largest_pgno)); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + - pgno2bytes(env, largest_pgno), - pgno2bytes(env, edge - largest_pgno)); + VALGRIND_MAKE_MEM_NOACCESS( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); } #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ #if MDBX_ENABLE_MADVISE && \ @@ -11863,13 +11859,13 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000) advise = MADV_FREE; #endif /* MADV_FREE */ - int err = madvise(env->me_map + discard_edge_bytes, + int err = madvise(ptr_disp(env->me_map, discard_edge_bytes), prev_discarded_bytes - discard_edge_bytes, advise) ? ignore_enosys(errno) : MDBX_SUCCESS; #else int err = ignore_enosys(posix_madvise( - env->me_map + discard_edge_bytes, + ptr_disp(env->me_map, discard_edge_bytes), prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); #endif if (unlikely(MDBX_IS_ERROR(err))) { @@ -12060,9 +12056,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, #ifndef NDEBUG /* debug: provoke failure to catch a violators, but don't touch mm_psize * to allow readers catch actual pagesize. 
*/ - uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; - uint8_t *provoke_end = (uint8_t *)&target->mm_sign; - memset(provoke_begin, 0xCC, provoke_end - provoke_begin); + void *provoke_begin = &target->mm_dbs[FREE_DBI].md_root; + void *provoke_end = &target->mm_sign; + memset(provoke_begin, 0xCC, ptr_dist(provoke_end, provoke_begin)); jitter4testing(false); #endif @@ -12102,7 +12098,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, #endif /* MDBX_ENABLE_PGOP_STAT */ const MDBX_page *page = data_page(target); rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - (uint8_t *)page - env->me_map); + ptr_dist(page, env->me_map)); if (likely(rc == MDBX_SUCCESS)) { osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); if ((flags & MDBX_NOMETASYNC) == 0 && @@ -12122,14 +12118,14 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, #endif /* MDBX_ENABLE_PGOP_STAT */ const MDBX_meta undo_meta = *target; rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), - (uint8_t *)target - env->me_map); + ptr_dist(target, env->me_map)); if (unlikely(rc != MDBX_SUCCESS)) { undo: DEBUG("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. 
*/ osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta), - (uint8_t *)target - env->me_map); + ptr_dist(target, env->me_map)); goto fail; } osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); @@ -13028,9 +13024,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (env->me_dxb_mmap.filesize > used_bytes && env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes, + VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, used_bytes), env->me_dxb_mmap.filesize - used_bytes); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes, + MDBX_ASAN_POISON_MEMORY_REGION(ptr_disp(env->me_map, used_bytes), env->me_dxb_mmap.filesize - used_bytes); } env->me_poison_edge = @@ -13052,7 +13048,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, err = validate_meta_copy(env, target, &clone); if (unlikely(err != MDBX_SUCCESS)) { ERROR("target meta[%u] is corrupted", - bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); + bytes2pgno(env, ptr_dist(data_page(target), env->me_map))); meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } @@ -13094,7 +13090,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, err = validate_meta_copy(env, prefer_steady.ptr_c, &clone); if (unlikely(err != MDBX_SUCCESS)) { ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", - bytes2pgno(env, (uint8_t *)prefer_steady.ptr_c - env->me_map), + bytes2pgno(env, ptr_dist(prefer_steady.ptr_c, env->me_map)), "steady", prefer_steady.txnid, "manual recovery"); meta_troika_dump(env, &troika); return MDBX_CORRUPTED; @@ -13103,8 +13099,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, break; } - const pgno_t pgno = - bytes2pgno(env, (uint8_t *)recent.ptr_c - env->me_map); + const pgno_t pgno = bytes2pgno(env, ptr_dist(recent.ptr_c, env->me_map)); const bool last_valid = validate_meta_copy(env, 
recent.ptr_c, &clone) == MDBX_SUCCESS; eASSERT(env, @@ -13291,7 +13286,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, env->me_lck->mti_discarded_tail.weak, bytes2pgno(env, env->me_dxb_mmap.current)); err = - madvise(env->me_map + used_aligned2os_bytes, + madvise(ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) ? ignore_enosys(errno) : MDBX_SUCCESS; @@ -13304,7 +13299,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, env->me_lck->mti_discarded_tail.weak, bytes2pgno(env, env->me_dxb_mmap.current)); err = - madvise(env->me_map + used_aligned2os_bytes, + madvise(ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) ? ignore_enosys(errno) : MDBX_SUCCESS; @@ -13312,7 +13307,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return err; #elif defined(POSIX_MADV_DONTNEED) err = ignore_enosys(posix_madvise( - env->me_map + used_aligned2os_bytes, + ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; @@ -14331,11 +14326,13 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, memset(env->me_pbuf, -1, env->me_psize * 2); MDBX_txn *txn = osal_calloc(1, size); if (txn) { - txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbs = ptr_disp(txn, tsize); + txn->mt_cursors = + ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); txn->mt_dbiseqs = - (MDBX_atomic_uint32_t *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); + ptr_disp(txn->mt_cursors, sizeof(MDBX_cursor *) * env->me_maxdbs); + txn->mt_dbistate = ptr_disp( + txn->mt_dbiseqs, sizeof(MDBX_atomic_uint32_t) * env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; txn->mt_flags = 
MDBX_TXN_FINISHED; @@ -14633,8 +14630,8 @@ __hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { __hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; if (likely(shortest)) { - const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len; - const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len; + const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len); + const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len); const uint8_t *const end = pa - shortest; do { int diff = *--pa - *--pb; @@ -14695,8 +14692,8 @@ __hot static struct node_result node_search(MDBX_cursor *mc, do { i = (low + high) >> 1; nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); - cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); + cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); int cr = cmp(key, &nodekey); DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); if (cr > 0) @@ -14730,8 +14727,8 @@ __hot static struct node_result node_search(MDBX_cursor *mc, node = page_node(mp, i); nodekey.iov_len = node_ks(node); nodekey.iov_base = node_key(node); - cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); + cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); int cr = cmp(key, &nodekey); if (IS_LEAF(mp)) DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); @@ -16689,11 +16686,11 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { /* there's only a key anyway, so this is a no-op */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - char *ptr; size_t ksize = mc->mc_db->md_xsize; if (unlikely(key->iov_len != ksize)) return MDBX_BAD_VALSIZE; - ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 
ksize); + void *ptr = + page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); memcpy(ptr, key->iov_base, ksize); fix_parent: /* if overwriting slot 0 of leaf, need to @@ -16805,8 +16802,8 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { olddata.iov_len = node_ds(node); olddata.iov_base = node_data(node); - cASSERT(mc, (char *)olddata.iov_base + olddata.iov_len <= - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, ptr_disp(olddata.iov_base, olddata.iov_len) <= + ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); /* DB has dups? */ if (mc->mc_db->md_flags & MDBX_DUPSORT) { @@ -16946,10 +16943,10 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, memcpy(page_data(mp), page_data(fp), page_numkeys(fp) * fp->mp_leaf2_ksize); } else { - memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, - (char *)fp + fp->mp_upper + PAGEHDRSZ, + memcpy(ptr_disp(mp, mp->mp_upper + PAGEHDRSZ), + ptr_disp(fp, fp->mp_upper + PAGEHDRSZ), olddata.iov_len - fp->mp_upper - PAGEHDRSZ); - memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), + memcpy(mp->mp_ptrs, fp->mp_ptrs, page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); for (i = 0; i < page_numkeys(fp); i++) { cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); @@ -16988,8 +16985,8 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, cASSERT(mc, key->iov_len < UINT16_MAX); node_set_ks(node, key->iov_len); memcpy(node_key(node), key->iov_base, key->iov_len); - cASSERT(mc, (char *)node_key(node) + node_ds(node) < - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, ptr_disp(node_key(node), node_ds(node)) < + ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); goto fix_parent; } @@ -17133,7 +17130,7 @@ new_sub:; /* let caller know how many succeeded, if any */ data[1].iov_len = mcount; if (mcount < dcount) { - data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len; + data[0].iov_base = 
ptr_disp(data[0].iov_base, data[0].iov_len); insert_key = insert_data = false; goto more; } @@ -17350,12 +17347,12 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, mp->mp_lower = (indx_t)lower; mp->mp_upper = (indx_t)upper; - char *const ptr = page_leaf2key(mp, indx, ksize); + void *const ptr = page_leaf2key(mp, indx, ksize); cASSERT(mc, nkeys >= indx); const size_t diff = nkeys - indx; if (likely(diff > 0)) /* Move higher keys up one slot. */ - memmove(ptr + ksize, ptr, diff * ksize); + memmove(ptr_disp(ptr, ksize), ptr, diff * ksize); /* insert new key */ memcpy(ptr, key->iov_base, ksize); return MDBX_SUCCESS; @@ -17515,9 +17512,9 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { if (IS_LEAF2(mp)) { cASSERT(mc, ksize >= sizeof(indx_t)); size_t diff = nkeys - 1 - hole; - char *base = page_leaf2key(mp, hole, ksize); + void *const base = page_leaf2key(mp, hole, ksize); if (diff) - memmove(base, base + ksize, diff * ksize); + memmove(base, ptr_disp(base, ksize), diff * ksize); cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); @@ -17541,8 +17538,8 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { ? 
mp->mp_ptrs[r] + (indx_t)hole_size : mp->mp_ptrs[r]; - char *base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + hole_size, base, hole_offset - mp->mp_upper); + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); + memmove(ptr_disp(base, hole_size), base, hole_offset - mp->mp_upper); cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); @@ -17564,7 +17561,6 @@ __hot static void node_del(MDBX_cursor *mc, size_t ksize) { static void node_shrink(MDBX_page *mp, size_t indx) { MDBX_node *node; MDBX_page *sp, *xp; - char *base; size_t nsize, delta, len, ptr; intptr_t i; @@ -17582,7 +17578,7 @@ static void node_shrink(MDBX_page *mp, size_t indx) { assert(nsize % 1 == 0); len = nsize; } else { - xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */ + xp = ptr_disp(sp, delta); /* destination subpage */ for (i = page_numkeys(sp); --i >= 0;) { assert(sp->mp_ptrs[i] >= delta); xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); @@ -17595,8 +17591,8 @@ static void node_shrink(MDBX_page *mp, size_t indx) { node_set_ds(node, nsize); /* Shift upward */ - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + delta, base, (char *)sp + len - base); + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); + memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len); ptr = mp->mp_ptrs[indx]; for (i = page_numkeys(mp); --i >= 0;) { @@ -18090,7 +18086,6 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { static int update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp; MDBX_node *node; - char *base; size_t len; ptrdiff_t delta, ksize, oksize; intptr_t ptr, i, nkeys, indx; @@ -18135,9 +18130,9 @@ static int update_key(MDBX_cursor *mc, const MDBX_val *key) { } } - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); + memmove(ptr_disp(base, -delta), base, len); cASSERT(mc, 
mp->mp_upper >= delta); mp->mp_upper -= (indx_t)delta; @@ -18500,7 +18495,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { rc = node_add_leaf2(cdst, j++, &key); if (unlikely(rc != MDBX_SUCCESS)) return rc; - key.iov_base = (char *)key.iov_base + key.iov_len; + key.iov_base = ptr_disp(key.iov_base, key.iov_len); } while (++i != src_nkeys); } else { MDBX_node *srcnode = page_node(psrc, 0); @@ -18991,7 +18986,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); MDBX_env *const env = mc->mc_txn->mt_env; - const ptrdiff_t offset = (uint8_t *)mp - env->me_dxb_mmap.dxb; + const ptrdiff_t offset = ptr_dist(mp, env->me_map); unsigned flags_mask = P_ILL_BITS; unsigned flags_expected = 0; if (offset < 0 || @@ -19063,7 +19058,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", mp->mp_lower, mp->mp_upper, page_space(env)); - char *const end_of_page = (char *)mp + env->me_psize; + const char *const end_of_page = ptr_disp(mp, env->me_psize); const size_t nkeys = page_numkeys(mp); STATIC_ASSERT(P_BRANCH == 1); if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { @@ -19094,7 +19089,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { MDBX_val here, prev = {0, 0}; for (size_t i = 0; i < nkeys; ++i) { if (IS_LEAF2(mp)) { - char *const key = page_leaf2key(mp, i, leaf2_ksize); + const char *const key = page_leaf2key(mp, i, leaf2_ksize); if (unlikely(end_of_page < key + leaf2_ksize)) { rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", key + leaf2_ksize - end_of_page); @@ -19111,8 +19106,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; } if ((mc->mc_checking & CC_SKIPORD) == 0) { + here.iov_base = (void *)key; here.iov_len = 
leaf2_ksize; - here.iov_base = key; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) rc = bad_page(mp, "leaf2-key #%zu wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here)); @@ -19120,7 +19115,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } } else { const MDBX_node *const node = page_node(mp, i); - const char *node_end = (char *)node + NODESIZE; + const char *const node_end = ptr_disp(node, NODESIZE); if (unlikely(node_end > end_of_page)) { rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i, node_end - end_of_page); @@ -19129,7 +19124,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { const size_t ksize = node_ks(node); if (unlikely(ksize > ksize_max)) rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize); - char *key = node_key(node); + const char *const key = node_key(node); if (unlikely(end_of_page < key + ksize)) { rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i, key + ksize - end_of_page); @@ -19142,7 +19137,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n", i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); if ((mc->mc_checking & CC_SKIPORD) == 0) { - here.iov_base = key; + here.iov_base = (void *)key; here.iov_len = ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i, @@ -19282,8 +19277,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { for (int j = 0; j < nsubkeys; j++) { if (IS_LEAF2(sp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ - size_t sub_ksize = sp->mp_leaf2_ksize; - char *sub_key = page_leaf2key(sp, j, sub_ksize); + const size_t sub_ksize = sp->mp_leaf2_ksize; + const char *const sub_key = page_leaf2key(sp, j, sub_ksize); if (unlikely(end_of_subpage < sub_key + sub_ksize)) { rc = 
bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", sub_key + sub_ksize - end_of_subpage); @@ -19302,8 +19297,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; } if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_base = (void *)sub_key; sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; if (sub_prev.iov_base && unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) rc = bad_page(mp, @@ -19313,7 +19308,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { } } else { const MDBX_node *const sub_node = page_node(sp, j); - const char *sub_node_end = (char *)sub_node + NODESIZE; + const char *const sub_node_end = ptr_disp(sub_node, NODESIZE); if (unlikely(sub_node_end > end_of_subpage)) { rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", end_of_subpage - sub_node_end); @@ -19323,9 +19318,9 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { rc = bad_page(mp, "nested-node invalid flags (%u)\n", node_flags(sub_node)); - size_t sub_ksize = node_ks(sub_node); - char *sub_key = node_key(sub_node); - size_t sub_dsize = node_ds(sub_node); + const size_t sub_ksize = node_ks(sub_node); + const char *const sub_key = node_key(sub_node); + const size_t sub_dsize = node_ds(sub_node); /* char *sub_data = node_data(sub_node); */ if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || @@ -19336,8 +19331,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { sub_ksize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_base = (void *)sub_key; sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; if (sub_prev.iov_base && unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) rc = bad_page(mp, @@ -19775,14 +19770,12 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, 
TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (IS_LEAF2(sister)) { - char *split, *ins; - size_t lsize, rsize, ksize; /* Move half of the keys to the right sibling */ const intptr_t distance = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_xsize; - split = page_leaf2key(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); + size_t ksize = mc->mc_db->md_xsize; + void *const split = page_leaf2key(mp, split_indx, ksize); + size_t rsize = (nkeys - split_indx) * ksize; + size_t lsize = (nkeys - split_indx) * sizeof(indx_t); cASSERT(mc, mp->mp_lower >= lsize); mp->mp_lower -= (indx_t)lsize; cASSERT(mc, sister->mp_lower + lsize <= UINT16_MAX); @@ -19795,10 +19788,11 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base; if (distance < 0) { cASSERT(mc, ksize >= sizeof(indx_t)); - ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); + void *const ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(sister->mp_ptrs, split, rsize); sepkey.iov_base = sister->mp_ptrs; - memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memmove(ptr_disp(ins, ksize), ins, + (split_indx - mc->mc_ki[mc->mc_top]) * ksize); memcpy(ins, newkey->iov_base, ksize); cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mp->mp_lower += sizeof(indx_t); @@ -19806,9 +19800,10 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); } else { memcpy(sister->mp_ptrs, split, distance * ksize); - ins = page_leaf2key(sister, distance, ksize); + void *const ins = page_leaf2key(sister, distance, ksize); memcpy(ins, newkey->iov_base, ksize); - memcpy(ins + ksize, split + distance * ksize, rsize - distance * ksize); + memcpy(ptr_disp(ins, ksize), ptr_disp(split, distance * ksize), + rsize - distance * ksize); cASSERT(mc, UINT16_MAX - 
sister->mp_lower >= (int)sizeof(indx_t)); sister->mp_lower += sizeof(indx_t); cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t)); @@ -19887,8 +19882,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, cASSERT(mc, i <= nkeys); size_t size = new_size; if (i != newindx) { - MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); size = NODESIZE + node_ks(node) + sizeof(indx_t); if (IS_LEAF(mp)) size += (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) @@ -19924,8 +19918,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey = *newkey; if (split_indx != newindx) { MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] + - PAGEHDRSZ); + ptr_disp(mp, tmp_ki_copy->mp_ptrs[split_indx] + PAGEHDRSZ); sepkey.iov_len = node_ks(node); sepkey.iov_base = node_key(node); } @@ -20082,8 +20075,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, /* Update index for the new key. */ mc->mc_ki[mc->mc_top] = (indx_t)n; } else { - MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); rkey.iov_base = node_key(node); rkey.iov_len = node_ks(node); if (IS_LEAF(mp)) { @@ -20378,7 +20370,7 @@ static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, assert(pgno == 0 || bytes > PAGEHDRSZ); while (bytes > 0) { const size_t side = ctx->mc_head & 1; - const size_t left = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; + const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; if (left < (pgno ? 
PAGEHDRSZ : 1)) { int err = compacting_toggle_write_buffers(ctx); if (unlikely(err != MDBX_SUCCESS)) @@ -20401,7 +20393,7 @@ static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, } pgno = 0; } - src = (const char *)src + chunk; + src = ptr_disp(src, chunk); } else memset(dst, 0, chunk); bytes -= chunk; @@ -20434,8 +20426,7 @@ static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, if (unlikely(err != MDBX_SUCCESS)) return err; return compacting_put_bytes( - ctx, (const char *)mp + ctx->mc_env->me_psize - tail_bytes, tail_bytes, 0, - 0); + ctx, ptr_disp(mp, ctx->mc_env->me_psize - tail_bytes), tail_bytes, 0, 0); } __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, @@ -20451,18 +20442,18 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, return rc; /* Make cursor pages writable */ - char *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); + void *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); if (buf == NULL) return MDBX_ENOMEM; - char *ptr = buf; + void *ptr = buf; for (size_t i = 0; i < mc->mc_top; i++) { - page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); - mc->mc_pg[i] = (MDBX_page *)ptr; - ptr += ctx->mc_env->me_psize; + page_copy(ptr, mc->mc_pg[i], ctx->mc_env->me_psize); + mc->mc_pg[i] = ptr; + ptr = ptr_disp(ptr, ctx->mc_env->me_psize); } /* This is writable space for a leaf page. Usually not needed. 
*/ - MDBX_page *const leaf = (MDBX_page *)ptr; + MDBX_page *const leaf = ptr; while (mc->mc_snum > 0) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; @@ -20823,8 +20814,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, * but writing ones after the data was flushed */ memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ - (MDBX_meta *)(buffer + - ((uint8_t *)meta_recent(env, &troika).ptr_c - env->me_map)); + ptr_disp(buffer, ptr_dist(meta_recent(env, &troika).ptr_c, env->me_map)); mdbx_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) @@ -20899,7 +20889,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, ? (size_t)MDBX_ENVCOPY_WRITEBUF : used_size - offset; /* copy to avoid EFAULT in case swapped-out */ - memcpy(data_buffer, env->me_map + offset, chunk); + memcpy(data_buffer, ptr_disp(env->me_map, offset), chunk); rc = osal_write(fd, data_buffer, chunk); offset += chunk; } @@ -21746,7 +21736,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* Done here so we cannot fail after creating a new DB */ - char *namedup = osal_strdup(table_name); + char *const namedup = osal_strdup(table_name); if (unlikely(!namedup)) { rc = MDBX_ENOMEM; goto early_bailout; @@ -21888,7 +21878,7 @@ static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(dbi >= env->me_numdbs)) return MDBX_BAD_DBI; - char *ptr = env->me_dbxs[dbi].md_name.iov_base; + char *const ptr = env->me_dbxs[dbi].md_name.iov_base; /* If there was no name, this was already closed */ if (unlikely(!ptr)) return MDBX_BAD_DBI; @@ -23481,7 +23471,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { return rc; const MDBX_env *env = txn->mt_env; - const ptrdiff_t offset = (uint8_t *)ptr - env->me_map; + const ptrdiff_t offset = ptr_dist(ptr, env->me_map); if (offset >= 0) { const pgno_t pgno = bytes2pgno(env, offset); if (likely(pgno < txn->mt_next_pgno)) { diff 
--git a/src/internals.h b/src/internals.h index 1fcfbace..64d9a779 100644 --- a/src/internals.h +++ b/src/internals.h @@ -572,7 +572,14 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) -#define mp_next(mp) (*(MDBX_page **)((mp)->mp_ptrs + 2)) +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) #pragma pack(pop) @@ -1204,7 +1211,7 @@ struct MDBX_env { #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd #define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; @@ -1470,7 +1477,8 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = @@ -1486,7 +1494,7 @@ osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ diff --git a/src/osal.c b/src/osal.c index e08e09ab..3099a269 100644 --- a/src/osal.c +++ b/src/osal.c @@ -628,7 +628,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, assert(osal_iov_max > 0); #endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ - ior->boundary = (char *)(ior->pool + ior->allocated); + ior->boundary = ptr_disp(ior->pool, ior->allocated); return MDBX_SUCCESS; } @@ -645,9 +645,9 @@ static __inline size_t ior_offset(const ior_item_t *item) { static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) { #if defined(ior_sgv_element) assert(sgvcnt > 0); - return (ior_item_t *)((char *)item + sizeof(ior_item_t) - - sizeof(ior_sgv_element) + - sizeof(ior_sgv_element) * sgvcnt); + return (ior_item_t *)ptr_disp(item, sizeof(ior_item_t) - + sizeof(ior_sgv_element) + + sizeof(ior_sgv_element) * sgvcnt); #else assert(sgvcnt == 1); (void)sgvcnt; @@ -684,15 +684,15 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, size_t i = 0; do { item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data); - data = (char *)data + ior->pagesize; + data = ptr_disp(data, ior->pagesize); } while (++i < segments); ior->slots_left -= segments; item->sgv[ior->last_sgvcnt += segments].Buffer = 0; assert((item->single.iov_len & ior_WriteFile_flag) == 0); return MDBX_SUCCESS; } - const void *end = (char *)(item->single.iov_base) + item->single.iov_len - - ior_WriteFile_flag; + const void *end = ptr_disp(item->single.iov_base, + item->single.iov_len - ior_WriteFile_flag); if (unlikely(end == data)) { assert((item->single.iov_len & ior_WriteFile_flag) != 0); item->single.iov_len += bytes; @@ -700,8 +700,8 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, } #elif MDBX_HAVE_PWRITEV assert((int)item->sgvcnt > 0); - const void *end = (char 
*)(item->sgv[item->sgvcnt - 1].iov_base) + - item->sgv[item->sgvcnt - 1].iov_len; + const void *end = ptr_disp(item->sgv[item->sgvcnt - 1].iov_base, + item->sgv[item->sgvcnt - 1].iov_len); if (unlikely(end == data)) { item->sgv[item->sgvcnt - 1].iov_len += bytes; ior->last_bytes += bytes; @@ -718,7 +718,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, return MDBX_SUCCESS; } #else - const void *end = (char *)(item->single.iov_base) + item->single.iov_len; + const void *end = ptr_disp(item->single.iov_base, item->single.iov_len); if (unlikely(end == data)) { item->single.iov_len += bytes; return MDBX_SUCCESS; @@ -747,7 +747,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, /* WriteFileGather() */ item->sgv[0].Buffer = PtrToPtr64(data); for (size_t i = 1; i < segments; ++i) { - data = (char *)data + ior->pagesize; + data = ptr_disp(data, ior->pagesize); item->sgv[slots_used].Buffer = PtrToPtr64(data); } item->sgv[slots_used].Buffer = 0; @@ -1131,7 +1131,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { memset(ior->pool + ior->allocated, 0, sizeof(ior_item_t) * (items - ior->allocated)); ior->allocated = (unsigned)items; - ior->boundary = (char *)(ior->pool + ior->allocated); + ior->boundary = ptr_disp(ior->pool, ior->allocated); #if defined(_WIN32) || defined(_WIN64) if (useSetFileIoOverlappedRange) { if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes)) @@ -1474,7 +1474,7 @@ MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, #endif bytes -= written; offset += written; - buf = (char *)buf + written; + buf = ptr_disp(buf, written); } } @@ -1504,7 +1504,7 @@ MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, } #endif bytes -= written; - buf = (char *)buf + written; + buf = ptr_disp(buf, written); } } @@ -1701,7 +1701,7 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { MDBX_INTERNAL_FUNC int 
osal_msync(const osal_mmap_t *map, size_t offset, size_t length, enum osal_syncmode_bits mode_bits) { - uint8_t *ptr = (uint8_t *)map->address + offset; + void *ptr = ptr_disp(map->base, offset); #if defined(_WIN32) || defined(_WIN64) if (!FlushViewOfFile(ptr, length)) return (int)GetLastError(); @@ -2063,7 +2063,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, assert(size <= limit); map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; map->filesize = 0; #if defined(_WIN32) || defined(_WIN64) map->section = NULL; @@ -2115,7 +2115,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, : mdbx_RunningUnderWine() ? size : limit; err = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->address, + map->section, GetCurrentProcess(), &map->base, /* ZeroBits */ 0, /* CommitSize */ 0, /* SectionOffset */ NULL, &ViewSize, @@ -2126,10 +2126,10 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, if (!NT_SUCCESS(err)) { NtClose(map->section); map->section = 0; - map->address = nullptr; + map->base = nullptr; return ntstatus2errcode(err); } - assert(map->address != MAP_FAILED); + assert(map->base != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; map->limit = ViewSize; @@ -2160,7 +2160,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, #define MAP_NORESERVE 0 #endif - map->address = mmap( + map->base = mmap( NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, MAP_SHARED | MAP_FILE | MAP_NORESERVE | (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? 
MAP_NOSYNC : 0) | @@ -2168,10 +2168,10 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, : MAP_CONCEAL), map->fd, 0); - if (unlikely(map->address == MAP_FAILED)) { + if (unlikely(map->base == MAP_FAILED)) { map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; assert(errno != 0); return errno; } @@ -2179,39 +2179,38 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK - if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) return errno; #endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE - (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); #endif /* MADV_NOHUGEPAGE */ #endif /* MDBX_ENABLE_MADVISE */ #endif /* ! Windows */ - VALGRIND_MAKE_MEM_DEFINED(map->address, map->current); - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->current); + VALGRIND_MAKE_MEM_DEFINED(map->base, map->current); + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->current); return MDBX_SUCCESS; } MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, - (map->filesize && map->filesize < map->limit) - ? map->filesize - : map->limit); + MDBX_ASAN_UNPOISON_MEMORY_REGION( + map->base, (map->filesize && map->filesize < map->limit) ? 
map->filesize + : map->limit); #if defined(_WIN32) || defined(_WIN64) if (map->section) NtClose(map->section); - NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->base); if (!NT_SUCCESS(rc)) ntstatus2errcode(rc); #else - if (unlikely(munmap(map->address, map->limit))) { + if (unlikely(munmap(map->base, map->limit))) { assert(errno != 0); return errno; } @@ -2219,7 +2218,7 @@ MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; return MDBX_SUCCESS; } @@ -2252,7 +2251,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, return err; /* check ability of address space for growth before unmap */ - PVOID BaseAddress = (PBYTE)map->address + map->limit; + PVOID BaseAddress = (PBYTE)map->base + map->limit; SIZE_T RegionSize = limit - map->limit; status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, &RegionSize, MEM_RESERVE, PAGE_NOACCESS); @@ -2279,8 +2278,8 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, * when this memory will re-used by malloc or another mmapping. 
* See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); - status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->limit); + status = NtUnmapViewOfSection(GetCurrentProcess(), map->base); if (!NT_SUCCESS(status)) return ntstatus2errcode(status); status = NtClose(map->section); @@ -2292,7 +2291,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, bailout_ntstatus: err = ntstatus2errcode(status); bailout: - map->address = NULL; + map->base = NULL; map->current = map->limit = 0; if (ReservedAddress) { ReservedSize = 0; @@ -2307,7 +2306,7 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, retry_file_and_section: /* resizing of the file may take a while, * therefore we reserve address space to avoid occupy it by other threads */ - ReservedAddress = map->address; + ReservedAddress = map->base; status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0, &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); if (!NT_SUCCESS(status)) { @@ -2317,7 +2316,7 @@ retry_file_and_section: if (flags & MDBX_MRESIZE_MAY_MOVE) /* the base address could be changed */ - map->address = NULL; + map->base = NULL; } err = osal_filesize(map->fd, &map->filesize); @@ -2362,7 +2361,7 @@ retry_file_and_section: retry_mapview:; SIZE_T ViewSize = (flags & MDBX_RDONLY) ? 
size : limit; status = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->address, + map->section, GetCurrentProcess(), &map->base, /* ZeroBits */ 0, /* CommitSize */ 0, /* SectionOffset */ NULL, &ViewSize, @@ -2373,15 +2372,15 @@ retry_mapview:; if (!NT_SUCCESS(status)) { if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && - map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { + map->base && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { /* try remap at another base address */ - map->address = NULL; + map->base = NULL; goto retry_mapview; } NtClose(map->section); map->section = NULL; - if (map->address && (size != map->current || limit != map->limit)) { + if (map->base && (size != map->current || limit != map->limit)) { /* try remap with previously size and limit, * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; @@ -2393,7 +2392,7 @@ retry_mapview:; /* no way to recovery */ goto bailout_ntstatus; } - assert(map->address != MAP_FAILED); + assert(map->base != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; map->limit = ViewSize; @@ -2425,7 +2424,7 @@ retry_mapview:; * - this allows us to clear the mask only within the file size * when closing the mapping. */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - (char *)map->address + size, + ptr_disp(map->base, size), ((map->current < map->limit) ? map->current : map->limit) - size); } map->current = size; @@ -2437,7 +2436,7 @@ retry_mapview:; if (limit < map->limit) { /* unmap an excess at end of mapping. 
*/ // coverity[offset_free : FALSE] - if (unlikely(munmap(map->dxb + limit, map->limit - limit))) { + if (unlikely(munmap(ptr_disp(map->base, limit), map->limit - limit))) { assert(errno != 0); return errno; } @@ -2450,10 +2449,10 @@ retry_mapview:; return err; assert(limit > map->limit); - uint8_t *ptr = MAP_FAILED; + void *ptr = MAP_FAILED; #if (defined(__linux__) || defined(__gnu_linux__)) && defined(_GNU_SOURCE) - ptr = mremap(map->address, map->limit, limit, + ptr = mremap(map->base, map->limit, limit, #if defined(MREMAP_MAYMOVE) (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : #endif /* MREMAP_MAYMOVE */ @@ -2482,11 +2481,11 @@ retry_mapview:; if (ptr == MAP_FAILED) { /* Try to mmap additional space beyond the end of mapping. */ - ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot, + ptr = mmap(ptr_disp(map->base, map->limit), limit - map->limit, mmap_prot, mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); - if (ptr == map->dxb + map->limit) + if (ptr == ptr_disp(map->base, map->limit)) /* успешно прилепили отображение в конец */ - ptr = map->dxb; + ptr = map->base; else if (ptr != MAP_FAILED) { /* the desired address is busy, unmap unsuitable one */ if (unlikely(munmap(ptr, limit - map->limit))) { @@ -2519,13 +2518,13 @@ retry_mapview:; return MDBX_UNABLE_EXTEND_MAPSIZE; } - if (unlikely(munmap(map->address, map->limit))) { + if (unlikely(munmap(map->base, map->limit))) { assert(errno != 0); return errno; } // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, limit, mmap_prot, + ptr = mmap(map->base, limit, mmap_prot, (flags & MDBX_MRESIZE_MAY_MOVE) ? mmap_flags : mmap_flags | (MAP_FIXED_NOREPLACE ? 
MAP_FIXED_NOREPLACE @@ -2535,13 +2534,13 @@ retry_mapview:; unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED, - map->fd, 0); + ptr = + mmap(map->base, limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); if (unlikely(ptr == MAP_FAILED)) { /* try to restore prev mapping */ // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, map->limit, mmap_prot, + ptr = mmap(map->base, map->limit, mmap_prot, (flags & MDBX_MRESIZE_MAY_MOVE) ? mmap_flags : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE @@ -2551,21 +2550,20 @@ retry_mapview:; unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED, + ptr = mmap(map->base, map->limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); if (unlikely(ptr == MAP_FAILED)) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->address, - (map->current < map->limit) ? map->current : map->limit); + map->base, (map->current < map->limit) ? 
map->current : map->limit); map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; assert(errno != 0); return errno; } @@ -2575,38 +2573,38 @@ retry_mapview:; } assert(ptr && ptr != MAP_FAILED); - if (map->address != ptr) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + if (map->base != ptr) { + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->address, (map->current < map->limit) ? map->current : map->limit); + map->base, (map->current < map->limit) ? map->current : map->limit); VALGRIND_MAKE_MEM_DEFINED(ptr, map->current); MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current); - map->address = ptr; + map->base = ptr; } map->limit = limit; #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK - if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) { + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) { assert(errno != 0); return errno; } #endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE - (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); #endif /* MADV_NOHUGEPAGE */ #endif /* MDBX_ENABLE_MADVISE */ #endif /* POSIX / Windows */ assert(rc != MDBX_SUCCESS || - (map->address != nullptr && map->address != MAP_FAILED && + (map->base != nullptr && map->base != MAP_FAILED && map->current == size && map->limit == limit)); return rc; } diff --git a/src/osal.h b/src/osal.h index e8abacf3..fad805b4 100644 --- a/src/osal.h +++ b/src/osal.h @@ -228,10 +228,9 @@ typedef wchar_t pathchar_t; typedef char pathchar_t; #endif -typedef struct osal_mmap_param { +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; From 
b7734369a2dd6e281502a36fbdb9baaec75589e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 29 Nov 2022 01:16:36 +0300 Subject: [PATCH 245/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=B0=D0=B2=D1=8B=D1=87?= =?UTF-8?q?=D0=BA=D0=B8=20=D0=B4=D0=BB=D1=8F=20=D0=BF=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=BE=20=D0=B4=D1=8B=D1=80=D1=8F=D0=B2=D0=BE=D1=81=D1=82=D0=B8?= =?UTF-8?q?=20RISC-V=20=D0=B4=D0=BB=D1=8F=20=D1=81=D0=BE=D0=B2=D0=BC=D0=B5?= =?UTF-8?q?=D1=81=D1=82=D0=B8=D0=BC=D0=BE=D1=81=D1=82=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mdbx.h b/mdbx.h index 98e8b494..b4d25f74 100644 --- a/mdbx.h +++ b/mdbx.h @@ -77,10 +77,10 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #if defined(__riscv) || defined(__riscv__) || defined(__RISCV) || \ defined(__RISCV__) -#warning The RISC-V architecture is intentionally insecure by design. \ +#warning "The RISC-V architecture is intentionally insecure by design. \ Please delete this admonition at your own risk, \ if you make such decision informed and consciously. \ - Refer to https://clck.ru/32d9xH for more information. + Refer to https://clck.ru/32d9xH for more information." 
#endif /* RISC-V */ #ifdef _MSC_VER From 2776480f18dabe2ee083bea05512715d6675a74e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 29 Nov 2022 02:35:42 +0300 Subject: [PATCH 246/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=BF=D1=82=D0=B8=D0=BC?= =?UTF-8?q?=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20`pnl=5Fmerge()`=20=D0=B4?= =?UTF-8?q?=D0=BB=D1=8F=20=D1=81=D0=BB=D1=83=D1=87=D0=B0=D0=B5=D0=B2=20?= =?UTF-8?q?=D0=BD=D0=B5=D0=BF=D0=B5=D1=80=D0=B5=D0=BA=D1=80=D1=8B=D0=B2?= =?UTF-8?q?=D0=B0=D1=8E=D1=89=D0=B8=D1=85=D1=81=D1=8F=20=D0=BE=D0=B1=D1=8A?= =?UTF-8?q?=D0=B5=D0=B4=D0=B8=D0=BD=D1=8F=D0=B5=D0=BC=D1=8B=D1=85=20=D1=81?= =?UTF-8?q?=D0=BF=D0=B8=D1=81=D0=BA=D0=BE=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index 44bb8bf1..3fc7f05d 100644 --- a/src/core.c +++ b/src/core.c @@ -2451,8 +2451,21 @@ __hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { if (likely(src_len > 0)) { total += src_len; assert(MDBX_PNL_ALLOCLEN(dst) >= total); - dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); - pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 21 : 12)) + goto avoid_call_libc_for_short_cases; + if (dst_len == 0 || + MDBX_PNL_ORDERED(MDBX_PNL_LAST(dst), MDBX_PNL_FIRST(src))) + memcpy(MDBX_PNL_END(dst), MDBX_PNL_BEGIN(src), src_len * sizeof(pgno_t)); + else if (MDBX_PNL_ORDERED(MDBX_PNL_LAST(src), MDBX_PNL_FIRST(dst))) { + memmove(MDBX_PNL_BEGIN(dst) + src_len, MDBX_PNL_BEGIN(dst), + dst_len * sizeof(pgno_t)); + memcpy(MDBX_PNL_BEGIN(dst), MDBX_PNL_BEGIN(src), + src_len * sizeof(pgno_t)); + } else { + avoid_call_libc_for_short_cases: + dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 
0 : P_INVALID); + pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + } MDBX_PNL_SETSIZE(dst, total); } assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); From 512e6dbd080e9b123734c301100392cbe857c210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 30 Nov 2022 21:56:12 +0300 Subject: [PATCH 247/364] =?UTF-8?q?mdbx:=20=D0=BE=D1=82=D0=BA=D0=BB=D1=8E?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B1=D0=B5=D0=B7=D1=83=D1=81?= =?UTF-8?q?=D0=BB=D0=BE=D0=B2=D0=BD=D0=BE=D0=B3=D0=BE=20=D0=BF=D1=80=D0=B5?= =?UTF-8?q?=D0=B4=D0=BF=D0=BE=D1=87=D1=82=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=B7?= =?UTF-8?q?=D0=B0=D0=BF=D0=B8=D1=81=D0=B8=20=D1=87=D0=B5=D1=80=D0=B5=D0=B7?= =?UTF-8?q?=20=D0=B4=D0=B5=D1=81=D0=BA=D1=80=D0=B8=D0=BF=D1=82=D0=BE=D1=80?= =?UTF-8?q?=20=D1=81=20`O=5FDSYNC`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Требуется переработка = динамический выбор между write(O_DSYNC) и write()+fdatasync(), в зависимости от количества записываемых линейных фрагментов. 
--- src/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core.c b/src/core.c index 3fc7f05d..ff289306 100644 --- a/src/core.c +++ b/src/core.c @@ -14241,8 +14241,10 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { if ((flags & MDBX_NOMETASYNC) == 0) env->me_fd4meta = env->me_dsync_fd; +#if defined(_WIN32) || defined(_WIN64) if (env->me_fd4data == env->me_lazy_fd) env->me_fd4data = env->me_dsync_fd; +#endif /* Windows must die */ osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); } } From 163486fa3a6370dd86d355f86920eb1a27e97db3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 1 Dec 2022 03:00:40 +0300 Subject: [PATCH 248/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20FIXME=20=D0=B4=D0=BB=D1=8F=20`?= =?UTF-8?q?MDBX=5FNOMETASYNC`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index ff289306..b5d959af 100644 --- a/src/core.c +++ b/src/core.c @@ -11244,9 +11244,26 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + /* sync prev meta */ if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (uint32_t)head.txnid) { - /* sync prev meta */ + /* FIXME: Тут есть унаследованный от LMDB недочет. + * + * Проблем нет, если все процессы работающие с БД не используют WRITEMAP. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате fdatasync() при записи данных этой транзакции. + * + * Проблем нет, если все процессы работающие с БД используют WRITEMAP + * без MDBX_AVOID_MSYNC. 
+ * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате msync() при записи данных этой транзакции. + * + * Если же происходит комбинирование WRITEMAP и записи через файловый + * дескриптор, то требуется явно обновлять мета-страницу. Однако, + * так полностью теряется выгода от NOMETASYNC. + * + * Дефект же в том, что сейчас нет возможности отличить последний случай от + * двух предыдущих и поэтому приходится всегда задействовать meta_sync(). */ rc = meta_sync(env, head); if (unlikely(rc != MDBX_SUCCESS)) { ERROR("txn-%s: error %d", "presync-meta", rc); From 9f2d30c1a901df457748f1bab1d4410f0c475c07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 2 Dec 2022 10:27:13 +0300 Subject: [PATCH 249/364] =?UTF-8?q?mdbx:=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=80=D0=B0=D0=B7=D0=BC=D0=B5=D1=80?= =?UTF-8?q?=D0=B0=20=D0=BE=D1=82=D0=BE=D0=B1=D1=80=D0=B0=D0=B6=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B8=20`env=5Fsy?= =?UTF-8?q?nc()`=20=D0=B5=D1=81=D0=BB=D0=B8=20=D1=8D=D1=82=D0=BE=20=D1=82?= =?UTF-8?q?=D1=80=D0=B5=D0=B1=D1=83=D0=B5=D1=82=D1=81=D1=8F=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=81=D0=B1=D1=80=D0=BE=D1=81=D0=B0=20=D0=B4=D0=B0?= =?UTF-8?q?=D0=BD=D0=BD=D1=8B=D1=85=20=D0=BD=D0=B0=20=D0=B4=D0=B8=D1=81?= =?UTF-8?q?=D0=BA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/core.c b/src/core.c index b5d959af..7b1b3d17 100644 --- a/src/core.c +++ b/src/core.c @@ -7646,6 +7646,15 @@ retry:; goto bailout; } + if (!inside_txn && locked && (env->me_flags & MDBX_WRITEMAP) && + unlikely(head.ptr_c->mm_geo.next > + bytes2pgno(env, env->me_dxb_mmap.current))) { + rc = map_resize_implicit(env, head.ptr_c->mm_geo.next, + head.ptr_c->mm_geo.now, 
head.ptr_c->mm_geo.upper); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + const size_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = From 822952ef017e34bc8e8be69f819abc08768c0f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 2 Dec 2022 19:50:31 +0300 Subject: [PATCH 250/364] =?UTF-8?q?mdbx:=20=D0=B2=D0=BD=D1=83=D1=82=D1=80?= =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D0=B5=D0=B5=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B8?= =?UTF-8?q?=D0=BC=D0=B5=D0=BD=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20`MDBX?= =?UTF-8?q?=5FSYNC=5FKICK`=20(=D0=BA=D0=BE=D1=81=D0=BC=D0=B5=D1=82=D0=B8?= =?UTF-8?q?=D0=BA=D0=B0).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 12 ++++++------ src/osal.c | 4 ++-- src/osal.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/core.c b/src/core.c index 7b1b3d17..e5953d96 100644 --- a/src/core.c +++ b/src/core.c @@ -1681,7 +1681,7 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_RESULT_TRUE) { (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), - MDBX_SYNC_NONE); + MDBX_SYNC_KICK); err = uniq_poke(pending, &scan->me_lck_mmap, &salt); } if (err == MDBX_RESULT_TRUE) { @@ -4813,7 +4813,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const MDBX_env *env = txn->mt_env; rc = osal_msync(&txn->mt_env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_NONE); + pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; dpl_clear(txn->tw.dirtylist); @@ -5920,7 +5920,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* 
MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), - MDBX_SYNC_NONE); + MDBX_SYNC_KICK); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -11978,7 +11978,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, rc = MDBX_RESULT_FALSE /* carry steady */; if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; + enum osal_syncmode_bits mode_bits = MDBX_SYNC_KICK; unsigned sync_op = 0; if ((flags & MDBX_SAFE_NOSYNC) == 0) { sync_op = 1; @@ -12129,7 +12129,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), (flags & MDBX_NOMETASYNC) - ? MDBX_SYNC_NONE + ? MDBX_SYNC_KICK : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { #if MDBX_ENABLE_PGOP_STAT @@ -13533,7 +13533,7 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, #if MDBX_ENABLE_PGOP_STAT lck->mti_pgop_stat.wops.weak = 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); + err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_KICK); if (unlikely(err != MDBX_SUCCESS)) { ERROR("initial-%s for lck-file failed", "msync"); goto bailout; diff --git a/src/osal.c b/src/osal.c index 3099a269..b1e6ded4 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1560,7 +1560,7 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ while (1) { switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { - case MDBX_SYNC_NONE: + case MDBX_SYNC_KICK: return MDBX_SUCCESS /* nothing to do */; #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 case MDBX_SYNC_DATA: @@ -1714,7 +1714,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, 
size_t offset, // so just leave such optimization to the libc discretion. // // assert(linux_kernel_version > 0x02061300); - // if (mode_bits == MDBX_SYNC_NONE) + // if (mode_bits == MDBX_SYNC_KICK) // return MDBX_SUCCESS; #endif /* Linux */ if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) diff --git a/src/osal.h b/src/osal.h index fad805b4..aaa7809a 100644 --- a/src/osal.h +++ b/src/osal.h @@ -520,7 +520,7 @@ osal_thread_create(osal_thread_t *thread, MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { - MDBX_SYNC_NONE = 0, + MDBX_SYNC_KICK = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 From 23d236f70e690b21065fe2119caad0b71d9adf4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 3 Dec 2022 14:55:38 +0300 Subject: [PATCH 251/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`MDBX=5Fopt=5Fwritethrough=5Ft?= =?UTF-8?q?hreshold`=20=D0=B8=20=D1=81=D0=BE=D0=BF=D1=83=D1=82=D1=81=D1=82?= =?UTF-8?q?=D0=B2=D1=83=D1=8E=D1=89=D0=B8=D0=B5=20=D0=B4=D0=BE=D1=80=D0=B0?= =?UTF-8?q?=D0=B1=D0=BE=D1=82=D0=BA=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 28 ++++++ src/core.c | 218 +++++++++++++++++++++++++++++++++------------- src/internals.h | 11 ++- src/lck-windows.c | 36 +++++--- src/osal.c | 62 +++++++------ src/osal.h | 24 ++--- 6 files changed, 266 insertions(+), 113 deletions(-) diff --git a/mdbx.h b/mdbx.h index b4d25f74..3f58ccb0 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2220,6 +2220,34 @@ enum MDBX_option_t { * to 50% (half empty) which corresponds to the range from 8192 and to 32768 * in units respectively. 
*/ MDBX_opt_merge_threshold_16dot16_percent, + + /** \brief Controls the choice between using write-through disk writes and + * usual ones followed by a flush via the `fdatasync()` syscall. + * \details Depending on the operating system, storage subsystem + * characteristics and the use case, higher performance can be achieved by + * either using write-through or a series of usual/lazy writes followed by + * the flush-to-disk. + * + * Basically for N chunks the latency/cost of write-through is: + * latency = N * (emit + round-trip-to-storage + storage-execution); + * And for a series of lazy writes with flush is: + * latency = N * (emit + storage-execution) + flush + round-trip-to-storage. + * + * So, for large N and/or notable round-trip-to-storage the write+flush + * approach wins. But for small N and/or near-zero NVMe-like latency + * the write-through is better. + * + * To solve this issue libmdbx provides `MDBX_opt_writethrough_threshold`: + * - when N described above is less than or equal to the specified + * threshold, a write-through approach will be used; + * - otherwise, when N is greater than the specified threshold, + * a write-and-flush approach will be used. + * + * \note MDBX_opt_writethrough_threshold affects only \ref MDBX_SYNC_DURABLE + * mode without \ref MDBX_WRITEMAP, and is not supported on Windows. + * On Windows a write-through is used always but \ref MDBX_NOMETASYNC could + * be used for switching to write-and-flush.
*/ + MDBX_opt_writethrough_threshold, }; #ifndef __cplusplus /** \ingroup c_settings */ diff --git a/src/core.c b/src/core.c index e5953d96..a01c18cd 100644 --- a/src/core.c +++ b/src/core.c @@ -4446,6 +4446,7 @@ static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { typedef struct iov_ctx { MDBX_env *env; osal_ioring_t *ior; + mdbx_filehandle_t fd; int err; #ifndef MDBX_NEED_WRITTEN_RANGE #define MDBX_NEED_WRITTEN_RANGE 1 @@ -4458,10 +4459,12 @@ typedef struct iov_ctx { } iov_ctx_t; __must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, - size_t items, size_t npages) { + size_t items, size_t npages, + mdbx_filehandle_t fd) { ctx->env = txn->mt_env; ctx->ior = &txn->mt_env->me_ioring; - ctx->err = osal_ioring_reserve(ctx->ior, items, + ctx->fd = fd; + ctx->err = osal_ioring_prepare(ctx->ior, items, pgno_align2os_bytes(txn->mt_env, npages)); if (likely(ctx->err == MDBX_SUCCESS)) { #if MDBX_NEED_WRITTEN_RANGE @@ -4534,12 +4537,10 @@ static void iov_complete(iov_ctx_t *ctx) { __must_check_result static int iov_write(iov_ctx_t *ctx) { eASSERT(ctx->env, !iov_empty(ctx)); - osal_ioring_write_result_t r = osal_ioring_write(ctx->ior); + osal_ioring_write_result_t r = osal_ioring_write(ctx->ior, ctx->fd); #if MDBX_ENABLE_PGOP_STAT ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (!ctx->env->me_lck->mti_eoos_timestamp.weak) - ctx->env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); ctx->err = r.err; if (unlikely(ctx->err != MDBX_SUCCESS)) ERROR("Write error: %s", mdbx_strerror(ctx->err)); @@ -4596,7 +4597,6 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, ? 
ctx->flush_end : dp->mp_pgno + (pgno_t)npages; #endif /* MDBX_NEED_WRITTEN_RANGE */ - env->me_lck->mti_unsynced_pages.weak += npages; return MDBX_SUCCESS; } @@ -4816,6 +4816,8 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + env->me_lck->mti_unsynced_pages.weak += + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; dpl_clear(txn->tw.dirtylist); txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = mp_next(lp)) { @@ -4950,7 +4952,12 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); iov_ctx_t ctx; - rc = iov_init(txn, &ctx, amount_entries, amount_npages); + rc = + iov_init(txn, &ctx, amount_entries, amount_npages, +#if defined(_WIN32) || defined(_WIN64) + txn->mt_env->me_overlapped_fd ? 
txn->mt_env->me_overlapped_fd : +#endif + txn->mt_env->me_lazy_fd); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -5028,6 +5035,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages; if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; @@ -10543,7 +10551,7 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *const dl = dpl_sort(txn); int rc = MDBX_SUCCESS; - size_t r, w; + size_t r, w, total_npages = 0; for (w = 0, r = 1; r <= dl->length; ++r) { MDBX_page *dp = dl->items[r].ptr; if (dp->mp_flags & P_LOOSE) { @@ -10551,9 +10559,10 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { continue; } unsigned npages = dpl_npages(dl, r); + total_npages += npages; rc = iov_page(txn, ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) - break; + return rc; } if (!iov_empty(ctx)) { @@ -10561,6 +10570,13 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { rc = iov_write(ctx); } + if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->mt_env->me_lazy_fd) { + txn->mt_env->me_lck->mti_unsynced_pages.weak += total_npages; + if (!txn->mt_env->me_lck->mti_eoos_timestamp.weak) + txn->mt_env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); + } + + txn->tw.dirtylist->pages_including_loose -= total_npages; while (r <= dl->length) dl->items[++w] = dl->items[r++]; @@ -10569,6 +10585,8 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtylist->length == txn->tw.loose_count); + tASSERT(txn, txn->tw.dirtylist->pages_including_loose == txn->tw.loose_count); return rc; } @@ -11235,6 +11253,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { if (unlikely(rc != MDBX_SUCCESS)) goto fail; + tASSERT(txn, txn->tw.loose_count == 0); txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY) ? txn->mt_txnid : txn->mt_dbs[FREE_DBI].md_mod_txnid; @@ -11252,40 +11271,74 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } + bool need_flush_for_nometasync = false; const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + const uint32_t meta_sync_txnid = + atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); /* sync prev meta */ - if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid, - mo_Relaxed) != (uint32_t)head.txnid) { - /* FIXME: Тут есть унаследованный от LMDB недочет. + if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) { + /* Исправление унаследованного от LMDB недочета: * - * Проблем нет, если все процессы работающие с БД не используют WRITEMAP. + * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP. * Тогда мета-страница (обновленная, но не сброшенная на диск) будет * сохранена в результате fdatasync() при записи данных этой транзакции. * - * Проблем нет, если все процессы работающие с БД используют WRITEMAP + * Всё хорошо, если все процессы работающие с БД используют WRITEMAP * без MDBX_AVOID_MSYNC. * Тогда мета-страница (обновленная, но не сброшенная на диск) будет * сохранена в результате msync() при записи данных этой транзакции. * - * Если же происходит комбинирование WRITEMAP и записи через файловый - * дескриптор, то требуется явно обновлять мета-страницу. Однако, - * так полностью теряется выгода от NOMETASYNC. 
- * - * Дефект же в том, что сейчас нет возможности отличить последний случай от - * двух предыдущих и поэтому приходится всегда задействовать meta_sync(). */ - rc = meta_sync(env, head); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "presync-meta", rc); - goto fail; + * Если же в процессах работающих с БД используется оба метода, как sync() + * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то + * становится невозможным обеспечить фиксацию на диске мета-страницы + * предыдущей транзакции и данных текущей транзакции, за счет одной + * sync-операцией выполняемой после записи данных текущей транзакции. + * Соответственно, требуется явно обновлять мета-страницу, что полностью + * уничтожает выгоду от NOMETASYNC. */ + const uint32_t txnid_dist = + ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; + /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync() + * или msync() для гарантированной фиксации на диске мета-страницы, + * которая была "лениво" отправлена на запись в предыдущей транзакции, + * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */ + if ( +#if defined(_WIN32) || defined(_WIN64) + !env->me_overlapped_fd && +#endif + meta_sync_txnid == (uint32_t)head.txnid - txnid_dist) + need_flush_for_nometasync = true; + else { + rc = meta_sync(env, head); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "presync-meta", rc); + goto fail; + } } } if (txn->tw.dirtylist) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.loose_count == 0); + + mdbx_filehandle_t fd = +#if defined(_WIN32) || defined(_WIN64) + env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; + (void)need_flush_for_nometasync; +#else +#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2 + (need_flush_for_nometasync || + env->me_dsync_fd == INVALID_HANDLE_VALUE || + txn->tw.dirtylist->length > env->me_options.writethrough_threshold || + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) + ? env->me_lazy_fd + : env->me_dsync_fd; +#endif /* Windows */ + iov_ctx_t write_ctx; rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, - txn->tw.dirtylist->pages_including_loose - - txn->tw.loose_count); + txn->tw.dirtylist->pages_including_loose, fd); if (unlikely(rc != MDBX_SUCCESS)) { ERROR("txn-%s: error %d", "iov-init", rc); goto fail; @@ -11298,6 +11351,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } } else { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; + if (!env->me_lck->mti_eoos_timestamp.weak) + env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); } /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ @@ -12020,6 +12076,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); + eASSERT(env, env->me_lck->mti_unsynced_pages.weak > 0); + eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0); unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); } @@ -12188,9 +12246,15 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (unlikely(rc != MDBX_RESULT_TRUE)) goto fail; } + + const uint32_t sync_txnid_dist = + ((flags & MDBX_NOMETASYNC) == 0) ? 0 + : ((flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? 
MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; env->me_lck->mti_meta_sync_txnid.weak = pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak - - ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0); + sync_txnid_dist; *troika = meta_tap(env); for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) @@ -12349,11 +12413,8 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_maxreaders = DEFAULT_READERS; env->me_maxdbs = env->me_numdbs = CORE_DBS; - env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_fd4data = -#if defined(_WIN32) || defined(_WIN64) - env->me_overlapped_fd = -#endif /* Windows */ - env->me_lfd = INVALID_HANDLE_VALUE; + env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_lfd = + INVALID_HANDLE_VALUE; env->me_pid = osal_getpid(); env->me_stuck_meta = -1; @@ -12371,6 +12432,14 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_options.dp_loose_limit = 64; env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; +#if !(defined(_WIN32) || defined(_WIN64)) + env->me_options.writethrough_threshold = +#if defined(__linux__) || defined(__gnu_linux__) + mdbx_RunningOnWSL1 ? MAX_PAGENO : +#endif /* Linux */ + MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; +#endif /* Windows */ + env->me_os_psize = (unsigned)os_psize; setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize : MAX_PAGESIZE); @@ -14184,12 +14253,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000); osal_fseek(env->me_lazy_fd, safe_parking_lot_offset); - env->me_fd4data = env->me_fd4meta = env->me_lazy_fd; + env->me_fd4meta = env->me_lazy_fd; #if defined(_WIN32) || defined(_WIN64) - uint8_t ior_flags = 0; - if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) { - ior_flags = IOR_OVERLAPPED; - if ((flags & MDBX_WRITEMAP) && MDBX_AVOID_MSYNC) { + eASSERT(env, env->me_overlapped_fd == 0); + bool ior_direct = false; + if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC))) { + if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { /* Запрошен режим MDBX_SAFE_NOSYNC | MDBX_WRITEMAP при активной опции * MDBX_AVOID_MSYNC. * @@ -14203,23 +14272,30 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, * 2) Кроме этого, в Windows запись в заблокированный регион файла * возможно только через тот-же дескриптор. Поэтому изначальный захват * блокировок посредством osal_lck_seize(), захват/освобождение блокировок - * во время пишущих транзакций и запись данных должны выполнять через один - * дескриптор. + * во время пишущих транзакций и запись данных должны выполнятся через + * один дескриптор. * * Таким образом, требуется прочитать волатильный заголовок БД, чтобы * узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном * для записи данных, чтобы использовать именно этот дескриптор для * изначального захвата блокировок. 
*/ MDBX_meta header; - if (read_header(env, &header, MDBX_SUCCESS, true) == MDBX_SUCCESS && - header.mm_psize >= env->me_os_psize) - ior_flags |= IOR_DIRECT; + uint64_t dxb_filesize; + int err = read_header(env, &header, MDBX_SUCCESS, true); + if ((err == MDBX_SUCCESS && header.mm_psize >= env->me_os_psize) || + (err == MDBX_ENODATA && mode && env->me_psize >= env->me_os_psize && + osal_filesize(env->me_lazy_fd, &dxb_filesize) == MDBX_SUCCESS && + dxb_filesize == 0)) + /* Может быть коллизия, если два процесса пытаются одновременно создать + * БД с разным размером страницы, который у одного меньше системной + * страницы, а у другого НЕ меньше. Эта допустимая, но очень странная + * ситуация. Поэтому считаем её ошибочной и не пытаемся разрешить. */ + ior_direct = true; } - rc = - osal_openfile((ior_flags & IOR_DIRECT) ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT - : MDBX_OPEN_DXB_OVERLAPPED, - env, env_pathname.dxb, &env->me_overlapped_fd, 0); + rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT + : MDBX_OPEN_DXB_OVERLAPPED, + env, env_pathname.dxb, &env->me_overlapped_fd, 0); if (rc != MDBX_SUCCESS) goto bailout; env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); @@ -14227,7 +14303,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = (int)GetLastError(); goto bailout; } - env->me_fd4data = env->me_overlapped_fd; osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); } #else @@ -14260,17 +14335,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0 && - (env->me_fd4data == env->me_lazy_fd || !(flags & MDBX_NOMETASYNC))) { + if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC))) { rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { if ((flags & 
MDBX_NOMETASYNC) == 0) env->me_fd4meta = env->me_dsync_fd; -#if defined(_WIN32) || defined(_WIN64) - if (env->me_fd4data == env->me_lazy_fd) - env->me_fd4data = env->me_dsync_fd; -#endif /* Windows must die */ osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); } } @@ -14386,11 +14456,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = MDBX_ENOMEM; } if (rc == MDBX_SUCCESS) - rc = osal_ioring_create(&env->me_ioring, + rc = osal_ioring_create(&env->me_ioring #if defined(_WIN32) || defined(_WIN64) - ior_flags, + , + ior_direct, env->me_overlapped_fd #endif /* Windows */ - env->me_fd4data); + ); } #if MDBX_DEBUG @@ -14462,10 +14533,13 @@ __cold static int env_close(MDBX_env *env) { } #if defined(_WIN32) || defined(_WIN64) - if (env->me_overlapped_fd != INVALID_HANDLE_VALUE) { - CloseHandle(env->me_data_lock_event); + if (env->me_overlapped_fd) { CloseHandle(env->me_overlapped_fd); - env->me_overlapped_fd = INVALID_HANDLE_VALUE; + env->me_overlapped_fd = 0; + } + if (env->me_data_lock_event != INVALID_HANDLE_VALUE) { + CloseHandle(env->me_data_lock_event); + env->me_data_lock_event = INVALID_HANDLE_VALUE; } #endif /* Windows */ @@ -24054,6 +24128,24 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, recalculate_merge_threshold(env); break; + case MDBX_opt_writethrough_threshold: + if (value != (unsigned)value) + err = MDBX_EINVAL; + else +#if defined(_WIN32) || defined(_WIN64) + /* позволяем "установить" значение по-умолчанию и совпадающее + * с поведением соответствующим текущей установке MDBX_NOMETASYNC */ + if ((unsigned)-1 != (unsigned)value && + value != ((env->me_flags & MDBX_NOMETASYNC) ? 0 : INT_MAX)) + err = MDBX_EINVAL; +#else + env->me_options.writethrough_threshold = + ((unsigned)-1 == (unsigned)value) + ? 
MDBX_WRITETHROUGH_THRESHOLD_DEFAULT + : (unsigned)value; +#endif + + break; default: return MDBX_EINVAL; } @@ -24127,6 +24219,14 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, *pvalue = env->me_options.merge_threshold_16dot16_percent; break; + case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + *pvalue = (env->me_flags & MDBX_NOMETASYNC) ? 0 : INT_MAX; +#else + *pvalue = env->me_options.writethrough_threshold; +#endif + break; + default: return MDBX_EINVAL; } diff --git a/src/internals.h b/src/internals.h index 64d9a779..d504684c 100644 --- a/src/internals.h +++ b/src/internals.h @@ -761,6 +761,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. 
at the every steady checkpoint @@ -1213,10 +1217,10 @@ struct MDBX_env { osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd -#define me_fd4data me_ioring.fd mdbx_filehandle_t me_dsync_fd, me_fd4meta; #if defined(_WIN32) || defined(_WIN64) - HANDLE me_overlapped_fd, me_data_lock_event; +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; #endif /* Windows */ osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd @@ -1259,6 +1263,9 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ diff --git a/src/lck-windows.c b/src/lck-windows.c index 3bbe3f3b..e6ae78d2 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -152,8 +152,10 @@ static __inline int flock(HANDLE fd, unsigned flags, size_t offset, static __inline int flock_data(const MDBX_env *env, unsigned flags, size_t offset, size_t bytes) { - return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags, - offset, bytes); + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + return flock_with_event(fd4data, env->me_data_lock_event, flags, offset, + bytes); } static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { @@ -195,17 +197,19 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS; - int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + int rc = flock_with_event(fd4data, env->me_data_lock_event, dontwait ? 
(LCK_EXCLUSIVE | LCK_DONTWAIT) : (LCK_EXCLUSIVE | LCK_WAITFOR), DXB_BODY); if (rc == ERROR_LOCK_VIOLATION && dontwait) { SleepEx(0, true); - rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + rc = flock_with_event(fd4data, env->me_data_lock_event, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); if (rc == ERROR_LOCK_VIOLATION) { SleepEx(0, true); - rc = flock_with_event(env->me_fd4data, env->me_data_lock_event, + rc = flock_with_event(fd4data, env->me_data_lock_event, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); } } @@ -218,7 +222,9 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { void mdbx_txn_unlock(MDBX_env *env) { if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - int err = funlock(env->me_fd4data, DXB_BODY); + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + int err = funlock(fd4data, DXB_BODY); if (err != MDBX_SUCCESS) mdbx_panic("%s failed: err %u", __func__, err); } @@ -451,18 +457,20 @@ static void lck_unlock(MDBX_env *env) { SetLastError(ERROR_SUCCESS); } - if (env->me_fd4data != INVALID_HANDLE_VALUE) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + if (fd4data != INVALID_HANDLE_VALUE) { /* explicitly unlock to avoid latency for other processes (windows kernel * releases such locks via deferred queues) */ do - err = funlock(env->me_fd4data, DXB_BODY); + err = funlock(fd4data, DXB_BODY); while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); SetLastError(ERROR_SUCCESS); do - err = funlock(env->me_fd4data, DXB_WHOLE); + err = funlock(fd4data, DXB_WHOLE); while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); @@ -522,7 +530,9 @@ static int internal_seize_lck(HANDLE lfd) { } MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { - assert(env->me_fd4data != INVALID_HANDLE_VALUE); + const HANDLE fd4data = + env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; + assert(fd4data != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_RESULT_TRUE /* nope since files were must be opened non-shareable */ @@ -554,7 +564,7 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { return err; } jitter4testing(false); - err = funlock(env->me_fd4data, DXB_WHOLE); + err = funlock(fd4data, DXB_WHOLE); if (err != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "unlock-against-without-lck", err); @@ -564,8 +574,10 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { } MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; /* Transite from exclusive-write state (E-E) to used (S-?) */ - assert(env->me_fd4data != INVALID_HANDLE_VALUE); + assert(fd4data != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) diff --git a/src/osal.c b/src/osal.c index b1e6ded4..83e7ca57 100644 --- a/src/osal.c +++ b/src/osal.c @@ -606,16 +606,18 @@ static size_t osal_iov_max; #undef OSAL_IOV_MAX #endif /* OSAL_IOV_MAX */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd) { +) { memset(ior, 0, sizeof(osal_ioring_t)); - ior->fd = fd; #if defined(_WIN32) || defined(_WIN64) - ior->flags = flags; + ior->overlapped_fd = overlapped_fd; + ior->direct = enable_direct && overlapped_fd; const unsigned pagesize = (unsigned)osal_syspagesize(); ior->pagesize = pagesize; ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize); @@ -664,7 +666,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, #if defined(_WIN32) || defined(_WIN64) const unsigned segments = (unsigned)(bytes >> 
ior->pagesize_ln2); const bool use_gather = - (ior->flags & IOR_DIRECT) && ior->slots_left >= segments; + ior->direct && ior->overlapped_fd && ior->slots_left >= segments; #endif /* Windows */ ior_item_t *item = ior->pool; @@ -678,6 +680,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, (uintptr_t)(uint64_t)item->sgv[0].Buffer) & ior_alignment_mask) == 0 && ior->last_sgvcnt + segments < OSAL_IOV_MAX) { + assert(ior->overlapped_fd); assert((item->single.iov_len & ior_WriteFile_flag) == 0); assert(item->sgv[ior->last_sgvcnt].Buffer == 0); ior->last_bytes += bytes; @@ -745,6 +748,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, assert((item->single.iov_len & ior_WriteFile_flag) != 0); } else { /* WriteFileGather() */ + assert(ior->overlapped_fd); item->sgv[0].Buffer = PtrToPtr64(data); for (size_t i = 1; i < segments; ++i) { data = ptr_disp(data, ior->pagesize); @@ -814,7 +818,7 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk( } MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior) { +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { osal_ioring_write_result_t r = {MDBX_SUCCESS, 0}; #if defined(_WIN32) || defined(_WIN64) @@ -828,6 +832,7 @@ osal_ioring_write(osal_ioring_t *ior) { size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; r.wops += 1; if (bytes & ior_WriteFile_flag) { + assert(ior->overlapped_fd && fd == ior->overlapped_fd); bytes = ior->pagesize; while (item->sgv[i].Buffer) { bytes += ior->pagesize; @@ -840,11 +845,10 @@ osal_ioring_write(osal_ioring_t *ior) { r.err = GetLastError(); bailout_rc: assert(r.err != MDBX_SUCCESS); - CancelIo(ior->fd); + CancelIo(fd); return r; } - if (WriteFileGather(ior->fd, item->sgv, (DWORD)bytes, nullptr, - &item->ov)) { + if (WriteFileGather(fd, item->sgv, (DWORD)bytes, nullptr, &item->ov)) { assert(item->ov.Internal == 0 && WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0); ior_put_event(ior, 
item->ov.hEvent); @@ -854,7 +858,7 @@ osal_ioring_write(osal_ioring_t *ior) { if (unlikely(r.err != ERROR_IO_PENDING)) { ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileGather", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFileGather", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -863,11 +867,11 @@ osal_ioring_write(osal_ioring_t *ior) { assert(wait_for > ior->event_pool + ior->event_stack); *--wait_for = item->ov.hEvent; } - } else if (ior->flags & IOR_OVERLAPPED) { + } else if (fd == ior->overlapped_fd) { assert(bytes < MAX_WRITE); retry: item->ov.hEvent = ior; - if (WriteFileEx(ior->fd, item->single.iov_base, (DWORD)bytes, &item->ov, + if (WriteFileEx(fd, item->single.iov_base, (DWORD)bytes, &item->ov, ior_wocr)) { async_started += 1; } else { @@ -876,7 +880,7 @@ osal_ioring_write(osal_ioring_t *ior) { default: ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -887,7 +891,7 @@ osal_ioring_write(osal_ioring_t *ior) { WARNING( "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -905,12 +909,12 @@ osal_ioring_write(osal_ioring_t *ior) { } else { assert(bytes < MAX_WRITE); DWORD written = 0; - if (!WriteFile(ior->fd, item->single.iov_base, (DWORD)bytes, &written, + if (!WriteFile(fd, item->single.iov_base, (DWORD)bytes, &written, 
&item->ov)) { r.err = (int)GetLastError(); ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", - "WriteFile", ior->fd, __Wpedantic_format_voidptr(item), + "WriteFile", fd, __Wpedantic_format_voidptr(item), item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); @@ -974,8 +978,7 @@ osal_ioring_write(osal_ioring_t *ior) { } if (!HasOverlappedIoCompleted(&item->ov)) { DWORD written = 0; - if (unlikely( - !GetOverlappedResult(ior->fd, &item->ov, &written, true))) { + if (unlikely(!GetOverlappedResult(fd, &item->ov, &written, true))) { ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 ", err %d", "GetOverlappedResult", __Wpedantic_format_voidptr(item), @@ -1025,16 +1028,16 @@ osal_ioring_write(osal_ioring_t *ior) { #if MDBX_HAVE_PWRITEV assert(item->sgvcnt > 0); if (item->sgvcnt == 1) - r.err = osal_pwrite(ior->fd, item->sgv[0].iov_base, item->sgv[0].iov_len, + r.err = osal_pwrite(fd, item->sgv[0].iov_base, item->sgv[0].iov_len, item->offset); else - r.err = osal_pwritev(ior->fd, item->sgv, item->sgvcnt, item->offset); + r.err = osal_pwritev(fd, item->sgv, item->sgvcnt, item->offset); // TODO: io_uring_prep_write(sqe, fd, ...); item = ior_next(item, item->sgvcnt); #else - r.err = osal_pwrite(ior->fd, item->single.iov_base, item->single.iov_len, + r.err = osal_pwrite(fd, item->single.iov_base, item->single.iov_len, item->offset); item = ior_next(item, 1); #endif @@ -1055,8 +1058,10 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { #if defined(_WIN32) || defined(_WIN64) if (ior->last) { for (ior_item_t *item = ior->pool; item <= ior->last;) { - if (!HasOverlappedIoCompleted(&item->ov)) - CancelIoEx(ior->fd, &item->ov); + if (!HasOverlappedIoCompleted(&item->ov)) { + assert(ior->overlapped_fd); + CancelIoEx(ior->overlapped_fd, &item->ov); + } if (item->ov.hEvent && item->ov.hEvent != ior) ior_put_event(ior, item->ov.hEvent); size_t i 
= 1; @@ -1090,13 +1095,12 @@ MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { #if defined(_WIN32) || defined(_WIN64) if (ior->state & IOR_STATE_LOCKED) return MDBX_SUCCESS; - const bool useSetFileIoOverlappedRange = (ior->flags & IOR_OVERLAPPED) && - mdbx_SetFileIoOverlappedRange && - items > 7; + const bool useSetFileIoOverlappedRange = + ior->overlapped_fd && mdbx_SetFileIoOverlappedRange && items > 42; const size_t ceiling = useSetFileIoOverlappedRange ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4) - : 4096; + : 1024; const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling); items = bytes / sizeof(ior_item_t); #endif /* Windows */ @@ -1134,7 +1138,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { ior->boundary = ptr_disp(ior->pool, ior->allocated); #if defined(_WIN32) || defined(_WIN64) if (useSetFileIoOverlappedRange) { - if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes)) + if (mdbx_SetFileIoOverlappedRange(ior->overlapped_fd, ptr, (ULONG)bytes)) ior->state += IOR_STATE_LOCKED; else return GetLastError(); diff --git a/src/osal.h b/src/osal.h index aaa7809a..31640c89 100644 --- a/src/osal.h +++ b/src/osal.h @@ -312,13 +312,12 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_DIRECT 1 -#define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; unsigned pagesize; unsigned last_sgvcnt; size_t last_bytes; - uint8_t flags, state, pagesize_ln2; + uint8_t direct, state, pagesize_ln2; unsigned event_stack; HANDLE *event_pool; volatile LONG async_waiting; @@ -335,7 +334,6 @@ typedef struct osal_ioring { #define ior_last_sgvcnt(ior, item) (1) #define ior_last_bytes(ior, item) (item)->single.iov_len #endif /* !Windows */ - mdbx_filehandle_t fd; ior_item_t *last; ior_item_t *pool; char *boundary; @@ -344,11 +342,13 @@ typedef struct osal_ioring { #ifndef __cplusplus /* Actually 
this is not ioring for now, but on the way. */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *, +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - uint8_t flags, + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ - mdbx_filehandle_t fd); +); MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); @@ -359,7 +359,7 @@ typedef struct osal_ioring_write_result { unsigned wops; } osal_ioring_write_result_t; MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior); +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); typedef struct iov_ctx iov_ctx_t; MDBX_INTERNAL_FUNC void osal_ioring_walk( @@ -377,11 +377,13 @@ osal_ioring_used(const osal_ioring_t *ior) { } MDBX_MAYBE_UNUSED static inline int -osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) { +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { items = (items > 32) ? items : 32; #if defined(_WIN32) || defined(_WIN64) - const size_t npages = bytes >> ior->pagesize_ln2; - items = (items > npages) ? items : npages; + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? 
items : npages; + } #else (void)bytes; #endif From db83bd34d2d0d0c8c65512c4d0fe374605eb1d6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 4 Dec 2022 18:08:06 +0300 Subject: [PATCH 252/364] =?UTF-8?q?mdbx-test:=20=D1=87=D1=82=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20=D0=B0=D0=BA=D1=82=D1=83=D0=B0=D0=BB=D1=8C=D0=BD?= =?UTF-8?q?=D1=8B=D1=85=20=D1=84=D0=BB=D0=B0=D0=B3=D0=BE=D0=B2=20=D1=80?= =?UTF-8?q?=D0=B5=D0=B6=D0=B8=D0=BC=D0=B0=20=D1=80=D0=B0=D0=B1=D0=BE=D1=82?= =?UTF-8?q?=D1=8B=20=D0=91=D0=94.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test.c++ | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/test.c++ b/test/test.c++ index 1d06dd77..c42b598f 100644 --- a/test/test.c++ +++ b/test/test.c++ @@ -158,12 +158,17 @@ void testcase::db_open() { if (config.params.random_writemap && flipcoin()) mode ^= MDBX_WRITEMAP; - actual_env_mode = mode; int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(), mode, 0640); if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_open()", rc); + unsigned env_flags_proxy; + rc = mdbx_env_get_flags(db_guard.get(), &env_flags_proxy); + if (unlikely(rc != MDBX_SUCCESS)) + failure_perror("mdbx_env_get_flags()", rc); + actual_env_mode = MDBX_env_flags_t(env_flags_proxy); + rc = mdbx_env_set_syncperiod(db_guard.get(), unsigned(0.042 * 65536)); if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_BUSY) failure_perror("mdbx_env_set_syncperiod()", rc); From 48a56d1d0525b6f4cab7e677dbe419b2c6646edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 4 Dec 2022 18:10:54 +0300 Subject: [PATCH 253/364] =?UTF-8?q?mdbx:=20=D0=B7=D0=B0=D0=BF=D1=80=D0=B5?= =?UTF-8?q?=D1=89=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=80=D0=B0=D0=B7=D0=BD=D0=BE?= 
=?UTF-8?q?=D0=B3=D0=BE=20`MDBX=5FWRITEMAP`=20=D0=BC=D0=B5=D0=B6=D0=B4?= =?UTF-8?q?=D1=83=20=D0=BF=D1=80=D0=BE=D1=86=D0=B5=D1=81=D1=81=D0=B0=D0=BC?= =?UTF-8?q?=D0=B8=20=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=B0=D1=85=20?= =?UTF-8?q?=D1=81=20=D0=BE=D1=82=D0=BB=D0=BE=D0=B6=D0=B5=D0=BD=D0=BD=D0=BE?= =?UTF-8?q?=D0=B9/=D0=BB=D0=B5=D0=BD=D0=B8=D0=B2=D0=BE=D0=B9=20=D0=B7?= =?UTF-8?q?=D0=B0=D0=BF=D0=B8=D1=81=D1=8C=D1=8E.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ранее упущенный не очевидный момент: При работе БД в режимах не-синхронной/отложенной фиксации на диске, все процессы-писатели должны иметь одинаковый режим MDBX_WRITEMAP. В противном случае, сброс на диск следует выполнять дважды: сначала msync(), затем fdatasync(). При этом msync() не обязан отрабатывать в процессах без MDBX_WRITEMAP, так как файл в память отображен только для чтения. Поэтому, в общем случае, различия по MDBX_WRITEMAP не позволяют выполнить фиксацию данных на диск, после их изменения в другом процессе. В режиме MDBX_UTTERLY_NOSYNC позволять совместную работу с MDBX_WRITEMAP также не следует, поскольку никакой процесс (в том числе последний) не может гарантированно сбросить данные на диск, а следовательно не должен помечать какую-либо транзакцию как steady. В результате, требуется либо запретить совместную работу процессам с разным MDBX_WRITEMAP в режиме отложенной записи, либо отслеживать такое смешивание и блокировать steady-пометки - что контрпродуктивно. 
--- src/core.c | 73 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/src/core.c b/src/core.c index a01c18cd..30de3226 100644 --- a/src/core.c +++ b/src/core.c @@ -14328,12 +14328,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } osal_fseek(env->me_lfd, safe_parking_lot_offset); - const MDBX_env_flags_t rigorous_flags = - MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; - const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | - MDBX_LIFORECLAIM | - MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; - eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC))) { rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, @@ -14345,11 +14339,19 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } } + const MDBX_env_flags_t lazy_flags = + MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC; + const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM | + MDBX_NORDAHEAD | MDBX_RDONLY | + MDBX_WRITEMAP; + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { - while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) { + MDBX_env_flags_t snap_flags; + while ((snap_flags = atomic_load32(&lck->mti_envmode, mo_AcquireRelease)) == + MDBX_RDONLY) { if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY, - env->me_flags & mode_flags)) { + (snap_flags = (env->me_flags & mode_flags)))) { /* The case: * - let's assume that for some reason the DB file is smaller * than it should be according to the geometry, @@ -14368,15 +14370,44 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (env->me_flags & MDBX_ACCEDE) { /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). 
*/ - const unsigned diff = - (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; - NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, - env->me_flags ^ diff); + const MDBX_env_flags_t diff = + (snap_flags ^ env->me_flags) & + ((snap_flags & lazy_flags) ? mode_flags + : mode_flags & ~MDBX_WRITEMAP); env->me_flags ^= diff; + NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, + env->me_flags ^ diff, env->me_flags); } - if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { - ERROR("%s", "current mode/flags incompatible with requested"); + /* Ранее упущенный не очевидный момент: При работе БД в режимах + * не-синхронной/отложенной фиксации на диске, все процессы-писатели должны + * иметь одинаковый режим MDBX_WRITEMAP. + * + * В противном случае, сброс на диск следует выполнять дважды: сначала + * msync(), затем fdatasync(). При этом msync() не обязан отрабатывать + * в процессах без MDBX_WRITEMAP, так как файл в память отображен только + * для чтения. Поэтому, в общем случае, различия по MDBX_WRITEMAP не + * позволяют выполнить фиксацию данных на диск, после их изменения в другом + * процессе. + * + * В режиме MDBX_UTTERLY_NOSYNC позволять совместную работу с MDBX_WRITEMAP + * также не следует, поскольку никакой процесс (в том числе последний) не + * может гарантированно сбросить данные на диск, а следовательно не должен + * помечать какую-либо транзакцию как steady. + * + * В результате, требуется либо запретить совместную работу процессам с + * разным MDBX_WRITEMAP в режиме отложенной записи, либо отслеживать такое + * смешивание и блокировать steady-пометки - что контрпродуктивно. */ + const MDBX_env_flags_t rigorous_flags = + (snap_flags & lazy_flags) + ? 
MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP + : MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC; + const MDBX_env_flags_t rigorous_diff = + (snap_flags ^ env->me_flags) & rigorous_flags; + if (rigorous_diff) { + ERROR("current mode/flags 0x%X incompatible with requested 0x%X, " + "rigorous diff 0x%X", + env->me_flags, snap_flags, rigorous_diff); rc = MDBX_INCOMPATIBLE; goto bailout; } @@ -14397,11 +14428,14 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } DEBUG("opened dbenv %p", (void *)env); + if (!lck || lck_rc == MDBX_RESULT_TRUE) { + env->me_lck->mti_envmode.weak = env->me_flags & mode_flags; + env->me_lck->mti_meta_sync_txnid.weak = + (uint32_t)recent_committed_txnid(env); + env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); + } if (lck) { if (lck_rc == MDBX_RESULT_TRUE) { - lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); - lck->mti_meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env); - lck->mti_reader_check_timestamp.weak = osal_monotime(); rc = osal_lck_downgrade(env); DEBUG("lck-downgrade-%s: rc %i", (env->me_flags & MDBX_EXCLUSIVE) ? 
"partial" : "full", rc); @@ -14420,11 +14454,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, goto bailout; env->me_flags |= MDBX_ENV_TXKEY; } - } else { - env->me_lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); - env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)recent_committed_txnid(env); - env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); } if ((flags & MDBX_RDONLY) == 0) { From dc27d5d30abcdbeea85196427477888950d678a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 3 Dec 2022 15:35:27 +0300 Subject: [PATCH 254/364] =?UTF-8?q?mdbx:=20=D1=80=D0=B5=D1=84=D0=B0=D0=BA?= =?UTF-8?q?=D1=82=D0=BE=D1=80=D0=B8=D0=BD=D0=B3=20=D1=81=20=D1=84=D0=BE?= =?UTF-8?q?=D1=80=D0=BC=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5?= =?UTF-8?q?=D0=BC=20`page=5Falloc=5Ffinalize()`=20=D0=B8=20=D1=81=D0=BE?= =?UTF-8?q?=D0=BA=D1=80=D0=B0=D1=88=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20=D0=BC?= =?UTF-8?q?=D0=B5=D1=82=D1=80=D0=B8=D0=BA=20`MDBX=5FENABLE=5FPROFGC`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 14 ++--- src/core.c | 162 +++++++++++++++++++----------------------------- src/internals.h | 7 +-- 3 files changed, 69 insertions(+), 114 deletions(-) diff --git a/mdbx.h b/mdbx.h index 3f58ccb0..db395328 100644 --- a/mdbx.h +++ b/mdbx.h @@ -3794,13 +3794,10 @@ struct MDBX_commit_latency { /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри * GC ради данных пользователя. */ uint32_t work_rtime_monotonic; - /** \brief Монотонное время по "настенным часам" затраченное + /** \brief Время ЦПУ в режиме пользователе затраченное * на подготовку страниц извлекаемых из GC для данных пользователя, * включая подкачку с диска. 
*/ - uint32_t work_xtime_monotonic; - /** \brief Время ЦПУ в режиме пользователе затраченное на чтение и поиск - * внтури GC ради данных пользователя. */ - uint32_t work_rtime_cpu; + uint32_t work_xtime_cpu; /** \brief Количество итераций поиска внутри GC при выделении страниц * ради данных пользователя. */ uint32_t work_rsteps; @@ -3817,13 +3814,10 @@ struct MDBX_commit_latency { /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри * GC для целей поддержки и обновления самой GC. */ uint32_t self_rtime_monotonic; - /** \brief Монотонное время по "настенным часам" затраченное на подготовку + /** \brief Время ЦПУ в режиме пользователе затраченное на подготовку * страниц извлекаемых из GC для целей поддержки и обновления самой GC, * включая подкачку с диска. */ - uint32_t self_xtime_monotonic; - /** \brief Время ЦПУ в режиме пользователе затраченное на чтение и поиск - * внтури GC для целей поддержки и обновления самой GC. */ - uint32_t self_rtime_cpu; + uint32_t self_xtime_cpu; /** \brief Количество итераций поиска внутри GC при выделении страниц * для целей поддержки и обновления самой GC. */ uint32_t self_rsteps; diff --git a/src/core.c b/src/core.c index 30de3226..48a64091 100644 --- a/src/core.c +++ b/src/core.c @@ -6753,13 +6753,62 @@ __hot static pgno_t pnl_get_sequence(MDBX_PNL pnl, const size_t num, return 0; } -static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, +static __inline pgr_t page_alloc_finalize(MDBX_env *const env, + MDBX_txn *const txn, + const MDBX_cursor *const mc, + const pgno_t pgno, const size_t num) { +#if MDBX_ENABLE_PROFGC + size_t majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? 
&env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; +#else + (void)mc; +#endif /* MDBX_ENABLE_PROFGC */ + ENSURE(env, pgno >= NUM_METAS); + + pgr_t ret; + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + } else { + ret.page = page_malloc(txn, num); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + goto bailout; + } + } + + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(ret.page, -1, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { + ret.page->mp_pages = (pgno_t)num; + ret.page->mp_flags = P_OVERFLOW; + } + + ret.err = page_dirty(txn, ret.page, (pgno_t)num); +bailout: + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; +#endif /* MDBX_ENABLE_PROFGC */ + return ret; +} + +static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, uint8_t flags) { #if MDBX_ENABLE_PROFGC const uint64_t monotime_before = osal_monotime(); - size_t majflt_before; - const uint64_t cputime_before = osal_cputime(&majflt_before); - uint64_t monotime_shot = 0; #endif /* MDBX_ENABLE_PROFGC */ pgr_t ret; @@ -7162,9 +7211,6 @@ no_gc: aligned = txn->mt_geo.upper; eASSERT(env, aligned >= newnext); -#if MDBX_ENABLE_PROFGC - monotime_shot = osal_monotime(); -#endif /* MDBX_ENABLE_PROFGC */ VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, aligned - txn->mt_end_pgno); ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, @@ -7193,36 +7239,7 @@ done: eASSERT(env, pgno >= 
NUM_METAS && pgno + num <= txn->mt_next_pgno); } - ENSURE(env, pgno >= NUM_METAS); -#if MDBX_ENABLE_PROFGC - if (!monotime_shot) - monotime_shot = osal_monotime(); -#endif /* MDBX_ENABLE_PROFGC */ - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - } else { - ret.page = page_malloc(txn, num); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto fail; - } - } - - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(ret.page, -1, pgno2bytes(env, num)); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { - ret.page->mp_pages = (pgno_t)num; - ret.page->mp_flags = P_OVERFLOW; - } - - ret.err = page_dirty(txn, ret.page, (pgno_t)num); + ret = page_alloc_finalize(env, txn, mc, pgno, num); if (unlikely(ret.err != MDBX_SUCCESS)) { fail: eASSERT(env, ret.err != MDBX_SUCCESS); @@ -7260,23 +7277,13 @@ done: ret.page = NULL; } - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); #if MDBX_ENABLE_PROFGC - size_t majflt_after; - prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; - prof->majflt += majflt_after - majflt_before; - const uint64_t monotime_now = osal_monotime(); - if (monotime_shot) { - prof->xtime_monotonic += monotime_shot - monotime_before; - prof->rtime_monotonic += monotime_now - monotime_shot; - } else - prof->rtime_monotonic += monotime_now - monotime_before; + prof->rtime_monotonic += osal_monotime() - monotime_before; #endif /* MDBX_ENABLE_PROFGC */ return ret; } -__hot static pgr_t page_alloc(const MDBX_cursor *mc) { +__hot static pgr_t page_alloc(const MDBX_cursor *const mc) { MDBX_txn *const txn = mc->mc_txn; /* If there are any loose pages, just use them */ @@ -7303,48 +7310,9 @@ 
__hot static pgr_t page_alloc(const MDBX_cursor *mc) { return ret; } - if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) { - const pgno_t pgno = pnl_get_single(txn->tw.relist); - MDBX_env *const env = txn->mt_env; - -#if MDBX_ENABLE_PROFGC - const uint64_t monotime_before = osal_monotime(); - size_t majflt_before; - const uint64_t cputime_before = osal_cputime(&majflt_before); - profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) - ? &env->me_lck->mti_pgop_stat.gc_prof.self - : &env->me_lck->mti_pgop_stat.gc_prof.work; -#endif /* MDBX_ENABLE_PROFGC */ - pgr_t ret; - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); - } else { - ret.page = page_malloc(txn, 1); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto bailout; - } - } - - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize); - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); - - ret.err = page_dirty(txn, ret.page, 1); - bailout: - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); -#if MDBX_ENABLE_PROFGC - size_t majflt_after; - prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; - prof->majflt += majflt_after - majflt_before; - prof->xtime_monotonic += osal_monotime() - monotime_before; -#endif /* MDBX_ENABLE_PROFGC */ - return ret; - } + if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) + return page_alloc_finalize(txn->mt_env, txn, mc, + pnl_get_single(txn->tw.relist), 1); return page_alloc_slowpath(mc, 1, MDBX_ALLOC_DEFAULT); } @@ -10969,10 +10937,8 @@ static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; latency->gc_prof.work_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); - latency->gc_prof.work_xtime_monotonic = - 
osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_monotonic); - latency->gc_prof.work_rtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_cpu); + latency->gc_prof.work_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu); latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; @@ -10980,10 +10946,8 @@ static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; latency->gc_prof.self_rtime_monotonic = osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); - latency->gc_prof.self_xtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_monotonic); - latency->gc_prof.self_rtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_cpu); + latency->gc_prof.self_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu); latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; diff --git a/src/internals.h b/src/internals.h index d504684c..3c206f1c 100644 --- a/src/internals.h +++ b/src/internals.h @@ -587,12 +587,9 @@ typedef struct profgc_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; - /* Монотонное время по "настенным часам" затраченное - * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ - uint64_t xtime_monotonic; /* Процессорное время в режим пользователя - * затраченное на чтение и поиск внутри GC */ - uint64_t rtime_cpu; + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. 
*/ + uint64_t xtime_cpu; /* Количество итераций чтения-поиска внутри GC при выделении страниц */ uint32_t rsteps; /* Количество запросов на выделение последовательностей страниц, From be3ff9277200aeb1cd8d0b5e7c1274b264222adb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 4 Dec 2022 20:04:13 +0300 Subject: [PATCH 255/364] =?UTF-8?q?mdbx:=20=D0=BF=D1=80=D0=B5=D0=B4=D0=BE?= =?UTF-8?q?=D1=82=D0=B2=D1=80=D0=B0=D1=89=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B1?= =?UTF-8?q?=D0=B5=D1=81=D0=BF=D0=BE=D0=BB=D0=B5=D0=B7=D0=BD=D1=8B=D1=85=20?= =?UTF-8?q?page-faults=20=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=B5=20`?= =?UTF-8?q?MDBX=5FWRITEMAP`=20(=D0=BE=D0=BF=D1=86=D0=B8=D1=8F=20=D1=81?= =?UTF-8?q?=D0=B1=D0=BE=D1=80=D0=BA=D0=B8=20`MDBX=5FENABLE=5FPREFAULT`).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 21 ++++++++--------- src/core.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++--- src/internals.h | 2 ++ src/options.h | 12 ++++++++++ 4 files changed, 82 insertions(+), 13 deletions(-) diff --git a/mdbx.h b/mdbx.h index db395328..a7715fd4 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2583,16 +2583,17 @@ struct MDBX_envinfo { * first process opened the database after everyone had previously closed it). 
*/ struct { - uint64_t newly; /**< Quantity of a new pages added */ - uint64_t cow; /**< Quantity of pages copied for update */ - uint64_t clone; /**< Quantity of parent's dirty pages clones - for nested transactions */ - uint64_t split; /**< Page splits */ - uint64_t merge; /**< Page merges */ - uint64_t spill; /**< Quantity of spilled dirty pages */ - uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ - uint64_t wops; /**< Number of explicit write operations (not a pages) - to a disk */ + uint64_t newly; /**< Quantity of a new pages added */ + uint64_t cow; /**< Quantity of pages copied for update */ + uint64_t clone; /**< Quantity of parent's dirty pages clones + for nested transactions */ + uint64_t split; /**< Page splits */ + uint64_t merge; /**< Page merges */ + uint64_t spill; /**< Quantity of spilled dirty pages */ + uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ + uint64_t wops; /**< Number of explicit write operations (not a pages) + to a disk */ + uint64_t prefault; /**< Number of prefault write operations (not a pages) */ uint64_t msync; /**< Number of explicit msync-to-disk operations (not a pages) */ uint64_t diff --git a/src/core.c b/src/core.c index 48a64091..6babacae 100644 --- a/src/core.c +++ b/src/core.c @@ -6773,18 +6773,68 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, ret.page = pgno2page(env, pgno); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + +#if MDBX_ENABLE_PREFAULT + /* Содержимое выделенной страницы не нужно, но если страница отсутствует + * в ОЗУ (что весьма вероятно), то любое обращение к ней приведет + * к page-fault: + * - прерыванию по отсутствию страницы; + * - переключение контекста в режим ядра с засыпанием процесса; + * - чтение страницы с диска; + * - обновление PTE и пробуждением процесса; + * - переключение контекста по доступности ЦПУ. 
+ * + * Пытаемся минимизировать накладные расходы записывая страницу, что при + * наличии unified page cache приведет к появлению страницы в ОЗУ без чтения + * с диска. При этом запись на диск должна быть отложена адекватным ядром, + * так как страница отображена в память в режиме чтения-записи и следом в + * неё пишет ЦПУ. */ + void *const pattern = ptr_disp( + env->me_pbuf, + (env->me_flags & MDBX_PAGEPERTURB) ? env->me_psize : env->me_psize * 2); + size_t file_offset = pgno2bytes(env, pgno); + /* TODO: добавить проверку через mincore() c кэшированием результатов. */ + if (likely(num == 1)) { + osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset); + } else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + iov[0].iov_len = env->me_psize; + iov[0].iov_base = pattern; + size_t n = 1, left = num - 1; + do { + iov[n].iov_len = env->me_psize; + iov[n].iov_base = pattern; + if (++n == MDBX_AUXILARY_IOV_MAX) { + osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, + file_offset); + file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + n = 0; + } + } while (--left); + osal_pwritev(env->me_lazy_fd, iov, n, file_offset); + } +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ +#else + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(ret.page, -1, pgno2bytes(env, num)); +#endif /* MDBX_ENABLE_PREFAULT */ + } else { ret.page = page_malloc(txn, num); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; goto bailout; } + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(ret.page, -1, pgno2bytes(env, num)); } - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(ret.page, -1, pgno2bytes(env, num)); VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - ret.page->mp_pgno = pgno; ret.page->mp_leaf2_ksize = 0; ret.page->mp_flags = 0; @@ -14428,6 +14478,7 @@ __cold int 
mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = alloc_page_buf(env); if (rc == MDBX_SUCCESS) { memset(env->me_pbuf, -1, env->me_psize * 2); + memset(ptr_disp(env->me_pbuf, env->me_psize * 2), 0, env->me_psize); MDBX_txn *txn = osal_calloc(1, size); if (txn) { txn->mt_dbs = ptr_disp(txn, tsize); @@ -21586,6 +21637,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); + arg->mi_pgop_stat.prefault = + atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed); arg->mi_pgop_stat.msync = atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); arg->mi_pgop_stat.fsync = @@ -24706,6 +24759,7 @@ __dll_export " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) + " MDBX_ENABLE_PREFAULT=" MDBX_STRINGIFY(MDBX_ENABLE_PREFAULT) " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) " MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) #if MDBX_DISABLE_VALIDATION diff --git a/src/internals.h b/src/internals.h index 3c206f1c..8f44ec18 100644 --- a/src/internals.h +++ b/src/internals.h @@ -619,6 +619,8 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. 
*/ diff --git a/src/options.h b/src/options.h index 2ab0dce6..dda2b2ad 100644 --- a/src/options.h +++ b/src/options.h @@ -87,6 +87,18 @@ #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ +/** Controls prevention of page-faults of reclaimed and allocated pages in the + * MDBX_WRITEMAP mode by clearing ones through file handle before touching. */ +#ifndef MDBX_ENABLE_PREFAULT +#if MDBX_MMAP_INCOHERENT_FILE_WRITE +#define MDBX_ENABLE_PREFAULT 0 +#else +#define MDBX_ENABLE_PREFAULT 1 +#endif +#elif !(MDBX_ENABLE_PREFAULT == 0 || MDBX_ENABLE_PREFAULT == 1) +#error MDBX_ENABLE_PREFAULT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_PREFAULT */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT From a772a9d3e11274016c73562f97c01d95635afda7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 5 Dec 2022 10:41:05 +0300 Subject: [PATCH 256/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=80=D0=BE=D0=B2=D0=B5?= =?UTF-8?q?=D1=80=D0=BA=D0=B8=20=D0=BF=D0=BE=D1=81=D1=80=D0=B5=D0=B4=D1=81?= =?UTF-8?q?=D1=82=D0=B2=D0=BE=D0=BC=20`mincore()`=20=D1=81=20=D0=BA=D1=8D?= =?UTF-8?q?=D1=88=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=D0=BC=20?= =?UTF-8?q?=D0=BF=D1=80=D0=B8=D1=81=D1=83=D1=82=D1=81=D1=82=D0=B2=D0=B8?= =?UTF-8?q?=D1=8F=20=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86=20=D0=B2=20?= =?UTF-8?q?=D0=BF=D0=B0=D0=BC=D1=8F=D1=82=D0=B8=20(=D0=BE=D0=BF=D1=86?= =?UTF-8?q?=D0=B8=D1=8F=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B8=20`MDBX=5FE?= =?UTF-8?q?NABLE=5FMINCORE`).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 1 + src/core.c | 176 +++++++++++++++++++++++++++++++++++++++--------- src/internals.h | 7 ++ src/options.h | 13 ++++ 
src/osal.c | 3 +- src/osal.h | 3 +- 6 files changed, 171 insertions(+), 32 deletions(-) diff --git a/mdbx.h b/mdbx.h index a7715fd4..e249ed1e 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2594,6 +2594,7 @@ struct MDBX_envinfo { uint64_t wops; /**< Number of explicit write operations (not a pages) to a disk */ uint64_t prefault; /**< Number of prefault write operations (not a pages) */ + uint64_t mincore; /**< Number of mincore() calls */ uint64_t msync; /**< Number of explicit msync-to-disk operations (not a pages) */ uint64_t diff --git a/src/core.c b/src/core.c index 6babacae..b463aa5b 100644 --- a/src/core.c +++ b/src/core.c @@ -5599,6 +5599,11 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, return MDBX_SUCCESS; } +static void mincore_clean_cache(const MDBX_env *const env) { + memset(env->me_lck->mti_mincore_cache.begin, -1, + sizeof(env->me_lck->mti_mincore_cache.begin)); +} + #if !(defined(_WIN32) || defined(_WIN64)) MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #ifdef ENOSYS @@ -5723,6 +5728,7 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge, #endif } } else { + mincore_clean_cache(env); #if defined(MADV_RANDOM) err = madvise(ptr, length, MADV_RANDOM) ? ignore_enosys(errno) : MDBX_SUCCESS; @@ -5938,6 +5944,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, ? 
0 : bytes2pgno(env, size_bytes); munlock_after(env, aligned_munlock_pgno, size_bytes); + mincore_clean_cache(env); #if MDBX_ENABLE_MADVISE if (size_bytes < prev_size) { @@ -6753,6 +6760,99 @@ __hot static pgno_t pnl_get_sequence(MDBX_PNL pnl, const size_t num, return 0; } +#if MDBX_ENABLE_MINCORE +static __inline bool bit_tas(uint64_t *field, char bit) { + const uint64_t m = UINT64_C(1) << bit; + const bool r = (*field & m) != 0; + *field |= m; + return r; +} + +static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) { + MDBX_lockinfo *const lck = env->me_lck; + for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { + const ptrdiff_t dist = unit_begin - lck->mti_mincore_cache.begin[i]; + if (likely(dist >= 0 && dist < 64)) { + const pgno_t tmp_begin = lck->mti_mincore_cache.begin[i]; + const uint64_t tmp_mask = lck->mti_mincore_cache.mask[i]; + do { + lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; + lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1]; + } while (--i); + lck->mti_mincore_cache.begin[0] = tmp_begin; + lck->mti_mincore_cache.mask[0] = tmp_mask; + return bit_tas(lck->mti_mincore_cache.mask, (char)dist); + } + } + + size_t pages = 64; + unsigned unit_log = sys_pagesize_ln2; + unsigned shift = 0; + if (env->me_psize > env->me_os_psize) { + unit_log = env->me_psize2log; + shift = env->me_psize2log - sys_pagesize_ln2; + pages <<= shift; + } + + const size_t offset = unit_begin << unit_log; + size_t length = pages << sys_pagesize_ln2; + if (offset + length > env->me_dxb_mmap.current) { + length = env->me_dxb_mmap.current - offset; + pages = length >> sys_pagesize_ln2; + } + +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.mincore.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + uint8_t *const vector = alloca(pages); + if (unlikely(mincore(ptr_disp(env->me_dxb_mmap.base, offset), length, + (void *)vector))) { + NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno); + 
return false; + } + + for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { + lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; + lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1]; + } + lck->mti_mincore_cache.begin[0] = unit_begin; + + uint64_t mask = 0; +#ifdef MINCORE_INCORE + STATIC_ASSERT(MINCORE_INCORE == 1); +#endif + for (size_t i = 0; i < pages; ++i) { + uint64_t bit = (vector[i] & 1) == 0; + bit <<= i >> shift; + mask |= bit; + } + + lck->mti_mincore_cache.mask[0] = ~mask; + return bit_tas(lck->mti_mincore_cache.mask, 0); +} +#endif /* MDBX_ENABLE_MINCORE */ + +MDBX_MAYBE_UNUSED static __inline bool mincore_probe(MDBX_env *const env, + const pgno_t pgno) { +#if MDBX_ENABLE_MINCORE + const size_t offset_aligned = + floor_powerof2(pgno2bytes(env, pgno), env->me_os_psize); + const unsigned unit_log2 = (env->me_psize2log > sys_pagesize_ln2) + ? env->me_psize2log + : sys_pagesize_ln2; + const size_t unit_begin = offset_aligned >> unit_log2; + eASSERT(env, (unit_begin << unit_log2) == offset_aligned); + const ptrdiff_t dist = unit_begin - env->me_lck->mti_mincore_cache.begin[0]; + if (likely(dist >= 0 && dist < 64)) + return bit_tas(env->me_lck->mti_mincore_cache.mask, (char)dist); + return mincore_fetch(env, unit_begin); +#else + (void)env; + (void)pgno; + return false; +#endif /* MDBX_ENABLE_MINCORE */ +} + static __inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn, const MDBX_cursor *const mc, @@ -6769,6 +6869,7 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, ENSURE(env, pgno >= NUM_METAS); pgr_t ret; + bool need_clean = (env->me_flags & MDBX_PAGEPERTURB) != 0; if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); @@ -6789,51 +6890,62 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, * с диска. 
При этом запись на диск должна быть отложена адекватным ядром, * так как страница отображена в память в режиме чтения-записи и следом в * неё пишет ЦПУ. */ - void *const pattern = ptr_disp( - env->me_pbuf, - (env->me_flags & MDBX_PAGEPERTURB) ? env->me_psize : env->me_psize * 2); - size_t file_offset = pgno2bytes(env, pgno); - /* TODO: добавить проверку через mincore() c кэшированием результатов. */ - if (likely(num == 1)) { - osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset); - } else { - struct iovec iov[MDBX_AUXILARY_IOV_MAX]; - iov[0].iov_len = env->me_psize; - iov[0].iov_base = pattern; - size_t n = 1, left = num - 1; - do { - iov[n].iov_len = env->me_psize; - iov[n].iov_base = pattern; - if (++n == MDBX_AUXILARY_IOV_MAX) { - osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, - file_offset); - file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); + const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1; + const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1; + /* Не суетимся если страница в зоне включенного упреждающего чтения */ + if (!readahead_enabled || pgno + num > readahead_edge) { + void *const pattern = ptr_disp( + env->me_pbuf, need_clean ? 
env->me_psize : env->me_psize * 2); + size_t file_offset = pgno2bytes(env, pgno); + if (likely(num == 1)) { + if (!mincore_probe(env, pgno)) { + osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset); #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.prefault.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - n = 0; + need_clean = false; } - } while (--left); - osal_pwritev(env->me_lazy_fd, iov, n, file_offset); - } + } else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + size_t n = 0, cleared = 0; + for (size_t i = 0; i < num; ++i) { + if (!mincore_probe(env, pgno + (pgno_t)i)) { + ++cleared; + iov[n].iov_len = env->me_psize; + iov[n].iov_base = pattern; + if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) { + osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, + file_offset); #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.prefault.weak += 1; + env->me_lck->mti_pgop_stat.prefault.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ -#else - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(ret.page, -1, pgno2bytes(env, num)); + file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); + n = 0; + } + } + } + if (likely(n > 0)) { + osal_pwritev(env->me_lazy_fd, iov, n, file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + if (cleared == num) + need_clean = false; + } + } #endif /* MDBX_ENABLE_PREFAULT */ - } else { ret.page = page_malloc(txn, num); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; goto bailout; } - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(ret.page, -1, pgno2bytes(env, num)); } + if (unlikely(need_clean)) + memset(ret.page, -1, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); ret.page->mp_pgno = pgno; ret.page->mp_leaf2_ksize = 0; @@ -14427,6 +14539,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } } + mincore_clean_cache(env); const int dxb_rc = setup_dxb(env, lck_rc, mode); if 
(MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; @@ -21639,6 +21752,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); arg->mi_pgop_stat.prefault = atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed); + arg->mi_pgop_stat.mincore = + atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed); arg->mi_pgop_stat.msync = atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); arg->mi_pgop_stat.fsync = @@ -24760,6 +24875,7 @@ __dll_export " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) " MDBX_ENABLE_PREFAULT=" MDBX_STRINGIFY(MDBX_ENABLE_PREFAULT) + " MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE) " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) " MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) #if MDBX_DISABLE_VALIDATION diff --git a/src/internals.h b/src/internals.h index 8f44ec18..8113e29a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -620,6 +620,7 @@ typedef struct pgop_stat { fsync; /* Number of explicit fsync/flush-to-disk operations */ MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, @@ -813,6 +814,12 @@ typedef struct MDBX_lockinfo { /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. 
*/ diff --git a/src/options.h b/src/options.h index dda2b2ad..3e9c9243 100644 --- a/src/options.h +++ b/src/options.h @@ -99,6 +99,19 @@ #error MDBX_ENABLE_PREFAULT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PREFAULT */ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + /** Enables chunking long list of retired pages during huge transactions commit * to avoid use sequences of pages. */ #ifndef MDBX_ENABLE_BIGFOOT diff --git a/src/osal.c b/src/osal.c index 83e7ca57..69a0b49f 100644 --- a/src/osal.c +++ b/src/osal.c @@ -3336,7 +3336,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, #ifndef xMDBX_ALLOY unsigned sys_pagesize; -MDBX_MAYBE_UNUSED unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED unsigned sys_pagesize_ln2, sys_allocation_granularity; #endif /* xMDBX_ALLOY */ void osal_ctor(void) { @@ -3362,6 +3362,7 @@ void osal_ctor(void) { assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); assert(sys_allocation_granularity >= sys_pagesize && sys_allocation_granularity % sys_pagesize == 0); + sys_pagesize_ln2 = log2n_powerof2(sys_pagesize); #if defined(__linux__) || defined(__gnu_linux__) posix_clockid = choice_monoclock(); diff --git a/src/osal.h b/src/osal.h index 31640c89..cdd6fa27 100644 --- a/src/osal.h +++ b/src/osal.h @@ -211,7 +211,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /* OS abstraction layer stuff */ MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + 
sys_allocation_granularity; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is From 48eeb93628ea045ef78b02579153eeb988851da7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 5 Dec 2022 19:14:08 +0300 Subject: [PATCH 257/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D0=B0=D0=B4=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D1=8F=20=D0=B2=20`env=5Fclose()`=20=D0=BF=D1=80?= =?UTF-8?q?=D0=B8=20=D0=B7=D0=B0=D0=BA=D1=80=D1=8B=D1=82=D0=B8=D0=B8=20?= =?UTF-8?q?=D1=81=D1=80=D0=B5=D0=B4=D1=8B=20=D0=BF=D0=B5=D1=80=D0=B5-?= =?UTF-8?q?=D0=BE=D1=82=D0=BA=D1=80=D1=8B=D1=82=D0=BE=D0=B9=20=D0=B2=20?= =?UTF-8?q?=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=B5=20=D1=82=D0=BE=D0=BB=D1=8C?= =?UTF-8?q?=D0=BA=D0=BE-=D0=B4=D0=BB=D1=8F-=D1=87=D1=82=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ошибка не была замечена ранее из-за многоходового сценария воспроизведения: 1. Создаём экземпляр MDBX_env посредством mdbx_env_create(); 2. Пытаемся открыть БД посредством mdbx_env_open() в режиме чтения-записи и эта попытка должна быть неудачной; 3. Не освобождая экземпляр MDBX_env повторно открываем его в режиме только-чтение; 4. Закрываем среду посредством mdbx_env_close(). Падение происходит на пункте 4, либо на пункте 3, если попытка повторного открытия будет не успешной. Причина в том, что внутренний экземпляр osal_ioring_t инициализировался только для режимов чтения-записи, а разрушался всегда. При этом после первого разрушения намеренно оставался в состоянии вызывающем падение при использовании без инициализации. [Simon Leier](https://t.me/leisim) сообщал об этой проблеме (теперь понятно что это было), но из-за сложности сценария проблему не удалось воспроизвести и идентифицировать.
--- src/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index b463aa5b..76b3b399 100644 --- a/src/core.c +++ b/src/core.c @@ -14671,7 +14671,8 @@ __cold static int env_close(MDBX_env *env) { } munlock_all(env); - osal_ioring_destroy(&env->me_ioring); + if (!(env->me_flags & MDBX_RDONLY)) + osal_ioring_destroy(&env->me_ioring); lcklist_lock(); const int rc = lcklist_detach_locked(env); From a9163f6307ed21536fc326a39c644bf36d21d86e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 6 Dec 2022 19:24:02 +0300 Subject: [PATCH 258/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B0=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B5?= =?UTF-8?q?=D0=BD=D0=BD=D0=B8=D1=85=20LRU-=D0=BE=D1=82=D0=BC=D0=B5=D1=82?= =?UTF-8?q?=D0=BE=D0=BA=20=D0=B4=D0=BB=D1=8F=20=D0=B0=D0=BA=D0=BA=D1=83?= =?UTF-8?q?=D1=80=D0=B0=D1=82=D0=BD=D0=BE=D0=B3=D0=BE=20=D1=81=D0=BF=D0=B8?= =?UTF-8?q?=D0=BB=D0=BB=D0=B8=D0=BD=D0=B3=D0=B0=20=D0=BE=D0=B3=D1=80=D0=BE?= =?UTF-8?q?=D0=BC=D0=BD=D1=8B=D1=85=20=D1=82=D1=80=D0=B0=D0=BD=D0=B7=D0=B0?= =?UTF-8?q?=D0=BA=D1=86=D0=B8=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 75 ++++++++++++++++++++++++++++++++++--------------- src/internals.h | 10 ++----- 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/src/core.c b/src/core.c index 76b3b399..e13c5cbf 100644 --- a/src/core.c +++ b/src/core.c @@ -2764,7 +2764,7 @@ static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { dl->length = len; dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; dl->items[len + 1].pgno = P_INVALID; - dl->items[len + 1].extra = 0; + dl->items[len + 1].mlru = 0; return len; } @@ -2779,7 +2779,7 @@ static __always_inline void dpl_clear(MDBX_dpl *dl) { dl->pages_including_loose = 0; dl->items[0].ptr = (MDBX_page 
*)&dpl_stub_pageB; dl->items[0].pgno = 0; - dl->items[0].extra = 0; + dl->items[0].mlru = 0; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } @@ -2955,7 +2955,9 @@ __hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned dpl_npages(const MDBX_dpl *dl, size_t i) { assert(0 <= (intptr_t)i && i <= dl->length); - unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages; + unsigned n = 1; + if (unlikely(dl->items[i].mlru & MDBX_dp_multi_mask)) + n = dl->items[i].ptr->mp_pages; assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1)); return n; } @@ -3043,20 +3045,50 @@ static void dpl_remove(const MDBX_txn *txn, size_t i) { dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); } +static __noinline void txn_lru_reduce(MDBX_txn *txn) { + NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1); + do { + txn->tw.dirtylru >>= 1; + MDBX_dpl *dl = txn->tw.dirtylist; + for (size_t i = 1; i <= dl->length; ++i) { + uint32_t mlru = dl->items[i].mlru; + mlru = (mlru & MDBX_dp_multi_mask) + ((mlru >> 1) & MDBX_dp_lru_mask); + dl->items[i].mlru = mlru; + } + txn = txn->mt_parent; + } while (txn); +} + +static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const MDBX_dpl *dl = txn->tw.dirtylist; + assert((intptr_t)i > 0 && i <= dl->length); + return (txn->tw.dirtylru + 1 - dl->items[i].mlru) >> 1; +} + +static __inline uint32_t txn_lru_inc(MDBX_txn *txn) { + if (unlikely(++txn->tw.dirtylru > UINT32_MAX / 3)) + txn_lru_reduce(txn); + return txn->tw.dirtylru & MDBX_dp_lru_mask; +} + static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & 
MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const MDBX_dp dp = {page, pgno, txn_lru_inc(txn) + (npages > 1)}; MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + tASSERT(txn, dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); if (AUDIT_ENABLED()) { for (size_t i = dl->length; i > 0; --i) { - assert(dl->items[i].pgno != pgno); - if (unlikely(dl->items[i].pgno == pgno)) { - ERROR("Page %u already exist in the DPL at %zu", pgno, i); + assert(dl->items[i].pgno != dp.pgno); + if (unlikely(dl->items[i].pgno == dp.pgno)) { + ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i); return MDBX_PROBLEM; } } @@ -3085,10 +3117,7 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, /* copy the stub beyond the end */ dl->items[length + 1] = dl->items[length]; /* append page */ - dl->items[length].ptr = page; - dl->items[length].pgno = pgno; - dl->items[length].multi = npages > 1; - dl->items[length].lru = txn->tw.dirtylru++; + dl->items[length] = dp; dl->length = length; dl->sorted = sorted; dl->pages_including_loose += npages; @@ -3096,15 +3125,6 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, return MDBX_SUCCESS; } -static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - const MDBX_dpl *dl = txn->tw.dirtylist; - assert((intptr_t)i > 0 && i <= dl->length); - /* overflow could be here */ - return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); -} - /*----------------------------------------------------------------------------*/ uint8_t runtime_flags = MDBX_RUNTIME_FLAGS_INIT; @@ -4627,7 +4647,9 @@ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { 
size_t const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && dpl_age(txn, n)) { - txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; + txn->tw.dirtylist->items[n].mlru = + (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + + (txn->tw.dirtylru & MDBX_dp_lru_mask); ++keep; } } @@ -4911,6 +4933,11 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); for (size_t i = 1; i <= dl->length; ++i) { const unsigned prio = spill_prio(txn, i, reciprocal); + TRACE("page %" PRIaPGNO + ", lru %u, is_multi %c, npages %u, age %u of %u, prio %u", + dl->items[i].pgno, dl->items[i].mlru & MDBX_dp_lru_mask, + (dl->items[i].mlru & MDBX_dp_multi_mask) ? 'Y' : 'N', + dpl_npages(dl, i), dpl_age(txn, i), age_max, prio); if (prio < 256) { radix_entries[prio] += 1; spillable_entries += 1; @@ -15160,7 +15187,9 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, const size_t i = dpl_search(spiller, pgno); tASSERT(txn, (intptr_t)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { - spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; + const uint32_t is_multi = + spiller->tw.dirtylist->items[i].mlru & MDBX_dp_multi_mask; + spiller->tw.dirtylist->items[i].mlru = is_multi + txn_lru_inc(txn); r.page = spiller->tw.dirtylist->items[i].ptr; break; } diff --git a/src/internals.h b/src/internals.h index 8113e29a..8cdb75e6 100644 --- a/src/internals.h +++ b/src/internals.h @@ -918,13 +918,9 @@ typedef txnid_t *MDBX_TXL; typedef struct MDBX_dp { MDBX_page *ptr; pgno_t pgno; - union { - uint32_t extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + uint32_t mlru; +#define MDBX_dp_multi_mask 1 +#define MDBX_dp_lru_mask UINT32_C(0xffffFFFe) } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. 
*/ From 3a77af7d8a1e11752a1a14a820d51c083c5a9b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 6 Dec 2022 22:20:00 +0300 Subject: [PATCH 259/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=BF=D1=82=D0=B8=D0=BC?= =?UTF-8?q?=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20=D0=BF=D0=BE=D0=B4=D0=B4?= =?UTF-8?q?=D0=B5=D1=80=D0=B6=D0=BA=D0=B8=20=D1=81=D0=BE=D1=80=D1=82=D0=B8?= =?UTF-8?q?=D1=80=D0=BE=D0=B2=D0=BA=D0=B8=20=D0=B2=20`dpl=5Fappend()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 99 +++++++++++++++++++++++++++++++++++++++---------- src/internals.h | 2 +- 2 files changed, 80 insertions(+), 21 deletions(-) diff --git a/src/core.c b/src/core.c index e13c5cbf..215c840c 100644 --- a/src/core.c +++ b/src/core.c @@ -2717,11 +2717,9 @@ static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { /*----------------------------------------------------------------------------*/ -#define MDBX_DPL_UNSORTED_BACKLOG 16 -#define MDBX_DPL_GAP_FOR_MERGESORT MDBX_DPL_UNSORTED_BACKLOG -#define MDBX_DPL_GAP_FOR_EDGING 2 -#define MDBX_DPL_RESERVE_GAP \ - (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING) +#define MDBX_DPL_GAP_MERGESORT 16 +#define MDBX_DPL_GAP_EDGING 2 +#define MDBX_DPL_RESERVE_GAP (MDBX_DPL_GAP_MERGESORT + MDBX_DPL_GAP_EDGING) static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); @@ -2847,7 +2845,7 @@ __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { unlikely(!dpl_radixsort(dl->items + 1, dl->length))) { if (dl->sorted > unsorted / 4 + 4 && (MDBX_DPL_PREALLOC_FOR_RADIXSORT || - dl->length + unsorted < dl->detent + MDBX_DPL_GAP_FOR_MERGESORT)) { + dl->length + unsorted < dl->detent + MDBX_DPL_GAP_MERGESORT)) { MDBX_dp *const sorted_begin = dl->items + 1; MDBX_dp *const sorted_end = sorted_begin + 
dl->sorted; MDBX_dp *const end = @@ -3094,12 +3092,6 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, } } - const size_t length = dl->length + 1; - const size_t sorted = - (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno) - ? length - : dl->sorted; - if (unlikely(dl->length == dl->detent)) { if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { ERROR("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); @@ -3114,14 +3106,78 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, tASSERT(txn, dl->length < dl->detent); } - /* copy the stub beyond the end */ - dl->items[length + 1] = dl->items[length]; - /* append page */ - dl->items[length] = dp; - dl->length = length; - dl->sorted = sorted; + /* Сортировка нужна для быстрого поиска, используем несколько тактик: + * 1) Сохраняем упорядоченность при естественной вставке в нужном порядке. + * 2) Добавляем в не-сортированный хвост, который сортируем и сливаем + * с отсортированной головой по необходимости, а пока хвост короткий + * ищем в нём сканированием, избегая большой пересортировки. + * 3) Если не-сортированный хвост короткий, а добавляемый элемент близок + * к концу отсортированной головы, то выгоднее сразу вставить элемент + * в нужное место. + * + * Алгоритмически: + * - добавлять в не-сортированный хвост следует только если вставка сильно + * дорогая, т.е. если целевая позиция элемента сильно далека от конца; + * - для быстрой проверки достаточно сравнить добавляемый элемент с отстоящим + * от конца на максимально-приемлемое расстояние; + * - если список короче, либо элемент в этой позиции меньше вставляемого, + * то следует перемещать элементы и вставлять в отсортированную голову; + * - если не-сортированный хвост длиннее, либо элемент в этой позиции больше, + * то следует добавлять в не-сортированный хвост. 
*/ + dl->pages_including_loose += npages; + MDBX_dp *i = dl->items + dl->length; + +#define MDBX_DPL_INSERTION_THRESHOLD 42 + const ptrdiff_t pivot = (ptrdiff_t)dl->length - MDBX_DPL_INSERTION_THRESHOLD; +#if MDBX_HAVE_CMOV + const pgno_t pivot_pgno = + dl->items[(dl->length < MDBX_DPL_INSERTION_THRESHOLD) + ? 0 + : dl->length - MDBX_DPL_INSERTION_THRESHOLD] + .pgno; +#endif /* MDBX_HAVE_CMOV */ + + /* copy the stub beyond the end */ + i[2] = i[1]; + dl->length += 1; + + if (likely(pivot <= (ptrdiff_t)dl->sorted) && +#if MDBX_HAVE_CMOV + pivot_pgno < dp.pgno) { +#else + (pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) { +#endif /* MDBX_HAVE_CMOV */ + dl->sorted += 1; + + /* сдвигаем несортированный хвост */ + while (i >= dl->items + dl->sorted) { +#if !defined(__GNUC__) /* пытаемся избежать вызова memmove() */ + i[1] = *i; +#elif MDBX_WORDBITS == 64 && \ + (defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)) + STATIC_ASSERT(sizeof(MDBX_dp) == sizeof(__uint128_t)); + ((__uint128_t *)i)[1] = *(volatile __uint128_t *)i; +#else + i[1].ptr = i->ptr; + i[1].pgno = i->pgno; + i[1].mlru = i->mlru; +#endif + --i; + } + /* ищем нужную позицию сдвигая отсортированные элементы */ + while (i->pgno > pgno) { + tASSERT(txn, i > dl->items); + i[1] = *i; + --i; + } + tASSERT(txn, i->pgno < dp.pgno); + } + + i[1] = dp; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + assert(dl->sorted <= dl->length); return MDBX_SUCCESS; } @@ -10082,7 +10138,8 @@ retry: MDBX_dpl *const dl = txn->tw.dirtylist; if (dl) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - size_t w = 0; + tASSERT(txn, dl->sorted <= dl->length); + size_t w = 0, sorted_out = 0; for (size_t r = w; ++r <= dl->length;) { MDBX_page *dp = dl->items[r].ptr; tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); @@ -10092,6 +10149,7 @@ retry: dl->items[w] = dl->items[r]; } else { tASSERT(txn, dp->mp_flags == P_LOOSE); + 
sorted_out += dl->sorted >= r; if (!MDBX_AVOID_MSYNC || !(env->me_flags & MDBX_WRITEMAP)) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); dpage_free(env, dp, 1); @@ -10101,8 +10159,9 @@ retry: TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", dbg_prefix_mode, dl->length, w); tASSERT(txn, txn->tw.loose_count == dl->length - w); + dl->sorted -= sorted_out; + tASSERT(txn, dl->sorted <= w); dpl_setlen(dl, w); - dl->sorted = 0; dl->pages_including_loose -= txn->tw.loose_count; txn->tw.dirtyroom += txn->tw.loose_count; tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == diff --git a/src/internals.h b/src/internals.h index 8cdb75e6..d67c912c 100644 --- a/src/internals.h +++ b/src/internals.h @@ -892,7 +892,7 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 /*----------------------------------------------------------------------------*/ From 4b27c4c7c9605a05dab4c7beb003b1c956fcacc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 7 Dec 2022 20:02:23 +0300 Subject: [PATCH 260/364] =?UTF-8?q?mdbx:=20=D0=BF=D1=80=D0=B5=D0=B4=D0=B2?= =?UTF-8?q?=D0=B0=D1=80=D0=B8=D1=82=D0=B5=D0=BB=D1=8C=D0=BD=D0=BE=D0=B5=20?= =?UTF-8?q?=D0=B2=D1=8B=D1=87=D0=B8=D1=81=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20?= =?UTF-8?q?`me=5Fmaxgc=5Fper=5Fbranch`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 54 ++++++++++++++++++++++++------------------------- src/internals.h | 7 ++++--- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/core.c b/src/core.c index 215c840c..0bd29a4d 100644 --- a/src/core.c +++ b/src/core.c @@ -9846,38 +9846,35 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { const intptr_t retired_left = MDBX_PNL_SIZEOF(txn->tw.retired_pages) 
- ctx->retired_stored; - size_t for_retiredlist = 0; + size_t for_relist = 0; if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { - for_retiredlist = (retired_left + txn->mt_env->me_maxgc_ov1page - 1) / - txn->mt_env->me_maxgc_ov1page; - const size_t per_branch_page = - (txn->mt_env->me_psize - PAGEHDRSZ) / - (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t)); - for (size_t entries = for_retiredlist; entries > 1; for_split += entries) + for_relist = (retired_left + txn->mt_env->me_maxgc_ov1page - 1) / + txn->mt_env->me_maxgc_ov1page; + const size_t per_branch_page = txn->mt_env->me_maxgc_per_branch; + for (size_t entries = for_relist; entries > 1; for_split += entries) entries = (entries + per_branch_page - 1) / per_branch_page; } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { - for_retiredlist = + for_relist = number_of_ovpages(txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); } const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; const size_t for_tree_after_touch = for_rebalance + for_split; - const size_t for_data = for_retiredlist; - const size_t for_all_before_touch = for_data + for_tree_before_touch; - const size_t for_all_after_touch = for_data + for_tree_after_touch; + const size_t for_all_before_touch = for_relist + for_tree_before_touch; + const size_t for_all_after_touch = for_relist + for_tree_after_touch; - if (likely(for_data < 2 && gcu_backlog_size(txn) > for_all_before_touch)) + if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch)) return MDBX_SUCCESS; TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " "4split %zu, " "4cow %zu, 4tree %zu)", ctx->retired_stored, retired_left, gcu_backlog_size(txn), - for_all_before_touch, for_data, for_split, for_cow, + for_all_before_touch, for_relist, for_split, for_cow, for_tree_before_touch); int err; - if (unlikely(for_data > 2)) { + if (unlikely(for_relist > 2)) { MDBX_val key, val; key.iov_base = val.iov_base = nullptr; key.iov_len = 
sizeof(txnid_t); @@ -9890,7 +9887,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { err = gcu_touch(ctx); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); - if (!MDBX_ENABLE_BIGFOOT && unlikely(for_data > 1) && + if (!MDBX_ENABLE_BIGFOOT && unlikely(for_relist > 1) && MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && err == MDBX_SUCCESS) { if (unlikely(ctx->retired_stored)) { @@ -9900,10 +9897,10 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { if (!ctx->retired_stored) return /* restart by tail-recursion */ gcu_prepare_backlog(txn, ctx); } - err = page_alloc_slowpath(&ctx->cursor, for_data, MDBX_ALLOC_RESERVE).err; + err = page_alloc_slowpath(&ctx->cursor, for_relist, MDBX_ALLOC_RESERVE).err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); cASSERT(&ctx->cursor, - gcu_backlog_size(txn) >= for_data || err != MDBX_SUCCESS); + gcu_backlog_size(txn) >= for_relist || err != MDBX_SUCCESS); } while (gcu_backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) @@ -9950,11 +9947,11 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. - * Page numbers cannot disappear from txn->tw.retired_pages[]. */ + * But page numbers cannot disappear from txn->tw.retired_pages[]. 
*/ retry: - ++ctx->loop; - TRACE("%s", " >> restart"); + if (ctx->loop++) + TRACE("%s", " >> restart"); int rc = MDBX_SUCCESS; tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -9979,13 +9976,11 @@ retry: ctx->rid = txn->tw.last_reclaimed; while (true) { /* Come back here after each Put() in case retired-list changed */ - MDBX_val key, data; TRACE("%s", " >> continue"); if (ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages) && - (ctx->loop == 1 || - MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || - ctx->retired_stored > env->me_maxgc_ov1page)) { + (ctx->loop == 1 || ctx->retired_stored > env->me_maxgc_ov1page || + MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page)) { rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -9993,6 +9988,7 @@ retry: tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + MDBX_val key, data; if (ctx->lifo) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) @@ -10446,10 +10442,11 @@ retry: } if (need_cleanup || ctx->dense) { - if (ctx->cleaned_slot) - TRACE("%s: restart inner-loop to clear and re-create GC entries", + if (ctx->cleaned_slot) { + TRACE("%s: restart to clear and re-create GC entries", dbg_prefix_mode); - ctx->cleaned_slot = 0; + goto retry; + } continue; } } @@ -12527,6 +12524,9 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { ENSURE(env, maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; + env->me_maxgc_per_branch = + (unsigned)((pagesize - PAGEHDRSZ) / + (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t))); STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); diff --git a/src/internals.h b/src/internals.h index d67c912c..0274cfc3 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1250,9 +1250,10 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ struct { From fd7aaf5f3511b4a20314fa3ebf13395013b8a034 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 8 Dec 2022 12:58:56 +0300 Subject: [PATCH 261/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= 
=?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA?= =?UTF-8?q?=D0=B8=20`MDBX=5FBACKLOG=5FDEPLETED`=20=D0=B8=20=D1=81=D0=BE?= =?UTF-8?q?=D0=BE=D1=82=D0=B2=D0=B5=D1=82=D1=81=D1=82=D0=B2=D1=83=D1=8E?= =?UTF-8?q?=D1=89=D0=B5=D0=B9=20=D0=BB=D0=BE=D0=B3=D0=B8=D0=BA=D0=B8=20?= =?UTF-8?q?=D0=B2=20`page=5Falloc=5Fslowpath()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 6 ++++++ src/bits.md | 2 +- src/core.c | 26 ++++++++++++++++++++++---- src/internals.h | 4 +++- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/mdbx.h b/mdbx.h index e249ed1e..cb14bf33 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1926,6 +1926,12 @@ enum MDBX_error_t { /** Overlapping read and write transactions for the current thread */ MDBX_TXN_OVERLAPPING = -30415, + /** Внутренняя ошибка возвращаемая в случае нехватки запаса свободных страниц + * при обновлении GC. Используется как вспомогательное средство для отладки. + * \note С точки зрения пользователя семантически + * равнозначна \ref MDBX_PROBLEM. 
*/ + MDBX_BACKLOG_DEPLETED = -30414, + /* The last of MDBX-added error codes */ MDBX_LAST_ADDED_ERRCODE = MDBX_TXN_OVERLAPPING, diff --git a/src/bits.md b/src/bits.md index 82c9eed4..99f9f117 100644 --- a/src/bits.md +++ b/src/bits.md @@ -5,7 +5,7 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD 2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| | 3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | 4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | -5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | +5 |0000 0020| |TXN_DRAINED_GC|INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | 6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | 7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | | | | 8 |0000 0100| _MAY_MOVE | | | | | | | <= | diff --git a/src/core.c b/src/core.c index 0bd29a4d..0962c74c 100644 --- a/src/core.c +++ b/src/core.c @@ -6750,10 +6750,12 @@ static __inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, !(mc->mc_flags & C_GCU)) return false; - /* avoid (recursive) search inside empty tree and while tree is - updating, https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ - if (txn->mt_dbs[FREE_DBI].md_entries == 0) + /* avoid search inside empty tree and while tree is updating, + https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ + if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { + txn->mt_flags |= MDBX_TXN_DRAINED_GC; return false; + } return true; } @@ -7090,8 +7092,10 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, //--------------------------------------------------------------------------- - if (unlikely(!is_gc_usable(txn, mc, flags))) + if (unlikely(!is_gc_usable(txn, mc, flags))) { + eASSERT(env, txn->mt_flags & MDBX_TXN_DRAINED_GC); goto no_gc; + } eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO | MDBX_ALLOC_SHOULD_SCAN)) == 0); @@ -7178,6 +7182,7 @@ next_gc:; if (unlikely(id >= detent)) 
goto depleted_gc; } + txn->mt_flags &= ~MDBX_TXN_DRAINED_GC; /* Reading next GC record */ MDBX_val data; @@ -7326,9 +7331,12 @@ scan: } depleted_gc: + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); ret.err = MDBX_NOTFOUND; if (flags & MDBX_ALLOC_SHOULD_SCAN) goto scan; + txn->mt_flags |= MDBX_TXN_DRAINED_GC; //------------------------------------------------------------------------- @@ -7431,6 +7439,14 @@ depleted_gc: no_gc: eASSERT(env, pgno == 0); +#ifndef MDBX_ENABLE_BACKLOG_DEPLETED +#define MDBX_ENABLE_BACKLOG_DEPLETED 0 +#endif /* MDBX_ENABLE_BACKLOG_DEPLETED*/ + if (MDBX_ENABLE_BACKLOG_DEPLETED && + unlikely(!(txn->mt_flags & MDBX_TXN_DRAINED_GC))) { + ret.err = MDBX_BACKLOG_DEPLETED; + goto fail; + } if (flags & MDBX_ALLOC_RESERVE) { ret.err = MDBX_NOTFOUND; goto fail; @@ -9915,6 +9931,8 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, (size_t)txn->mt_dbs[FREE_DBI].md_entries); + tASSERT(txn, + err != MDBX_NOTFOUND || (txn->mt_flags & MDBX_TXN_DRAINED_GC) != 0); return (err != MDBX_NOTFOUND) ? 
err : MDBX_SUCCESS; } diff --git a/src/internals.h b/src/internals.h index 0274cfc3..790ee2ea 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1013,9 +1013,11 @@ struct MDBX_txn { /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ + #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ From ebc4976acb4a8c9874d851e85c3f8dd4f3ecb0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 8 Dec 2022 13:42:55 +0300 Subject: [PATCH 262/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BD?= =?UTF-8?q?=D0=BE=D1=81=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F=20geo-=D1=80=D0=B0=D0=B7=D0=BC=D0=B5=D1=80=D0=B0?= =?UTF-8?q?=20=D0=B2=20`map=5Fresize()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/core.c b/src/core.c index 0962c74c..567394da 100644 --- a/src/core.c +++ b/src/core.c @@ -6109,6 +6109,9 @@ bailout: eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); eASSERT(env, size_bytes == env->me_dxb_mmap.current); + /* update env-geo to avoid influences */ + env->me_dbgeo.now = env->me_dxb_mmap.current; + env->me_dbgeo.upper = env->me_dxb_mmap.limit; #ifdef MDBX_USE_VALGRIND if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); @@ -13022,6 +13025,13 @@ 
mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, current_geo = &env->me_txn->mt_geo; } + /* update env-geo to avoid influences */ + env->me_dbgeo.now = pgno2bytes(env, current_geo->now); + env->me_dbgeo.lower = pgno2bytes(env, current_geo->lower); + env->me_dbgeo.upper = pgno2bytes(env, current_geo->upper); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(current_geo->grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(current_geo->shrink_pv)); + MDBX_geo new_geo; new_geo.lower = bytes2pgno(env, size_lower); new_geo.now = bytes2pgno(env, size_now); @@ -13094,16 +13104,20 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } else { meta.mm_geo = new_geo; rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika); + if (likely(rc == MDBX_SUCCESS)) { + env->me_dbgeo.now = pgno2bytes(env, new_geo.now = meta.mm_geo.now); + env->me_dbgeo.upper = + pgno2bytes(env, new_geo.upper = meta.mm_geo.upper); + } } - - if (likely(rc == MDBX_SUCCESS)) { - /* store new geo to env to avoid influences */ - env->me_dbgeo.now = pgno2bytes(env, new_geo.now); - env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); - env->me_dbgeo.upper = pgno2bytes(env, new_geo.upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); - } + } + if (likely(rc == MDBX_SUCCESS)) { + /* update env-geo to avoid influences */ + eASSERT(env, env->me_dbgeo.now == pgno2bytes(env, new_geo.now)); + env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); + eASSERT(env, env->me_dbgeo.upper == pgno2bytes(env, new_geo.upper)); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); } } From a572902fde3b307e50f105c248d24a62fd9f58f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= 
Date: Thu, 8 Dec 2022 15:35:41 +0300 Subject: [PATCH 263/364] =?UTF-8?q?mdbx:=20=D0=B0=D0=B2=D1=82=D0=BE=D0=BC?= =?UTF-8?q?=D0=B0=D1=82=D0=B8=D1=87=D0=B5=D1=81=D0=BA=D0=B0=D1=8F=20=D1=83?= =?UTF-8?q?=D1=81=D1=82=D0=B0=D0=BD=D0=BE=D0=B2=D0=BA=D0=B0=20`rp=5Faugmen?= =?UTF-8?q?t=5Flimit`=20=D0=B2=20"=D0=B7=D0=BE=D0=BB=D0=BE=D1=82=D0=BE?= =?UTF-8?q?=D0=B5=20=D1=81=D0=B5=D1=87=D0=B5=D0=BD=D0=B8=D0=B5"=20=D0=BE?= =?UTF-8?q?=D1=82=20=D1=80=D0=B0=D0=B7=D0=BC=D0=B5=D1=80=D0=B0=20=D0=91?= =?UTF-8?q?=D0=94.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 59 +++++++++++++++++++++++++++++++------------------ src/internals.h | 5 ++++- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/src/core.c b/src/core.c index 567394da..9b062704 100644 --- a/src/core.c +++ b/src/core.c @@ -5919,6 +5919,14 @@ __cold static void munlock_all(const MDBX_env *env) { munlock_after(env, 0, bytes_align2os_bytes(env, env->me_dxb_mmap.current)); } +__cold static unsigned default_rp_augment_limit(const MDBX_env *env) { + /* default drp_augment_limit = ceil(npages / gold_ratio) */ + const size_t augment = (env->me_dbgeo.now >> (env->me_psize2log + 10)) * 633u; + eASSERT(env, augment < MDBX_PGL_LIMIT); + return pnl_bytes2size(pnl_size2bytes( + (augment > MDBX_PNL_INITIAL) ? 
augment : MDBX_PNL_INITIAL)); +} + __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, const pgno_t size_pgno, const pgno_t limit_pgno, const bool implicit) { @@ -6112,6 +6120,8 @@ bailout: /* update env-geo to avoid influences */ env->me_dbgeo.now = env->me_dxb_mmap.current; env->me_dbgeo.upper = env->me_dxb_mmap.limit; + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); #ifdef MDBX_USE_VALGRIND if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); @@ -7236,15 +7246,17 @@ next_gc:; /* have enough unallocated space */ txn->mt_geo.upper >= txn->mt_next_pgno + num) || gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid large/overflow the page list. - * This is a rare case while search for a continuously multi-page region - * in a large database. - * https://libmdbx.dqdkfa.ru/dead-github/issues/123 - */ - NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " - "(chunk) -> %zu", + /* Stop reclaiming to avoid large/overflow the page list. This is a rare + * case while search for a continuously multi-page region in a + * large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */ + NOTICE("stop reclaiming %s: %zu (current) + %zu " + "(chunk) -> %zu, rp_augment_limit %u", + likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) < MDBX_PGL_LIMIT) + ? 
"since rp_augment_limit was reached" + : "to avoid PNL overflow", MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist), + env->me_options.rp_augment_limit); goto depleted_gc; } } @@ -12547,7 +12559,7 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; env->me_maxgc_per_branch = (unsigned)((pagesize - PAGEHDRSZ) / - (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t))); + (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t))); STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); @@ -12651,14 +12663,9 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_pid = osal_getpid(); env->me_stuck_meta = -1; - env->me_options.dp_reserve_limit = 1024; - env->me_options.rp_augment_limit = 256 * 1024; - env->me_options.dp_limit = MDBX_DEBUG ? 64 * 1024 / 42 : 64 * 1024; - if (env->me_options.dp_limit > MAX_PAGENO + 1 - NUM_METAS) - env->me_options.dp_limit = MAX_PAGENO + 1 - NUM_METAS; + env->me_options.rp_augment_limit = MDBX_PNL_INITIAL; + env->me_options.dp_reserve_limit = MDBX_PNL_INITIAL; env->me_options.dp_initial = MDBX_PNL_INITIAL; - if (env->me_options.dp_initial > env->me_options.dp_limit) - env->me_options.dp_initial = env->me_options.dp_limit; env->me_options.spill_max_denominator = 8; env->me_options.spill_min_denominator = 8; env->me_options.spill_parent4child_denominator = 0; @@ -12970,6 +12977,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step)))); env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); ENSURE(env, 
env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); @@ -14702,6 +14711,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } if ((flags & MDBX_RDONLY) == 0) { + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); const size_t tsize = sizeof(MDBX_txn) + sizeof(MDBX_cursor), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + @@ -23984,8 +23995,8 @@ __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); const uint64_t pgl_limit = - pagesize * (uint64_t)(MDBX_PGL_LIMIT / 1.6180339887498948482); - const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / 1.6180339887498948482); + pagesize * (uint64_t)(MDBX_PGL_LIMIT / MDBX_GOLD_RATIO_DBL); + const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL); return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit; } @@ -24332,11 +24343,15 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_rp_augment_limit: - if (value == UINT64_MAX) - value = MDBX_PGL_LIMIT; - if (unlikely(value > MDBX_PGL_LIMIT)) + if (value == UINT64_MAX) { + env->me_options.flags.non_auto.rp_augment_limit = 0; + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + } else if (unlikely(value > MDBX_PGL_LIMIT)) return MDBX_EINVAL; - env->me_options.rp_augment_limit = (unsigned)value; + else { + env->me_options.flags.non_auto.rp_augment_limit = 1; + env->me_options.rp_augment_limit = (unsigned)value; + } break; case MDBX_opt_txn_dp_limit: diff --git a/src/internals.h b/src/internals.h index 790ee2ea..2fc2b7a3 100644 --- a/src/internals.h +++ b/src/internals.h @@ -893,6 +893,7 @@ typedef struct MDBX_lockinfo { #define MDBX_READERS_LIMIT 32767 #define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ 
-936,7 +937,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -1276,6 +1278,7 @@ struct MDBX_env { /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; } non_auto; } flags; } me_options; From 1bb41ee8fcd7005b44e5e52c708e11d43f9adbd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 9 Dec 2022 13:20:38 +0300 Subject: [PATCH 264/364] =?UTF-8?q?mdbx:=20=D0=BE=D1=82=D0=BA=D0=BB=D1=8E?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=B8=D0=B5=20"=D1=8D=D0=BA=D0=BE=D0=BD?= =?UTF-8?q?=D0=BE=D0=BC=D0=B8=D0=B8=20=D0=BF=D0=BE=D1=81=D0=BB=D0=B5=D0=B4?= =?UTF-8?q?=D0=BE=D0=B2=D0=B0=D1=82=D0=B5=D0=BB=D1=8C=D0=BD=D0=BE=D1=81?= =?UTF-8?q?=D1=82=D0=B5=D0=B9"=20=D0=BF=D0=BE=D1=81=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D1=81=D1=82=D0=B2=D0=BE=D0=BC=20`MDBX=5FENABLE=5FSAVING=5FSEQU?= =?UTF-8?q?ENCES=3D0`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 107 ++++++++++++++++++++++++++++++++++++------------ src/internals.h | 2 +- 2 files changed, 82 insertions(+), 27 deletions(-) diff --git a/src/core.c b/src/core.c index 9b062704..0bb42154 100644 --- a/src/core.c +++ b/src/core.c @@ -6781,14 +6781,68 @@ __hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { return false; } -__hot static pgno_t pnl_get_single(MDBX_PNL pnl) { - const size_t len = MDBX_PNL_GETSIZE(pnl); +__hot static pgno_t relist_get_single(MDBX_txn *txn) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); assert(len > 0); - pgno_t *target = MDBX_PNL_EDGE(pnl); + pgno_t *target = MDBX_PNL_EDGE(txn->tw.relist); const ptrdiff_t dir = 
MDBX_PNL_ASCENDING ? 1 : -1; - /* пытаемся пропускать последовательности при наличии одиночных элементов */ - if (likely(len > 2) && unlikely(target[dir] == *target + 1)) { + /* Есть ТРИ потенциально выигрышные, но противо-направленные тактики: + * + * 1. Стараться использовать страницы с наименьшими номерами. Так обмен с + * диском будет более кучным, а у страниц ближе к концу БД будет больше шансов + * попасть под авто-компактификацию. Частично эта тактика уже реализована, но + * для её эффективности требуется явно приоритезировать выделение страниц: + * - поддерживать для relist, для ближних и для дальних страниц; + * - использовать страницы из дальнего списка, если первый пуст, + * а второй слишком большой, либо при пустой GC. + * + * 2. Стараться выделять страницы последовательно. Так записываемые на диск + * регионы будут линейными, что принципиально ускоряет запись на HDD. + * Одновременно, в среднем это не повлияет на чтение, точнее говоря, если + * порядок чтения не совпадает с порядком изменения (иначе говоря, если + * чтение не коррелирует с обновлениями и/или вставками) то не повлияет, иначе + * может ускорить. Однако, последовательности в среднем достаточно редки. + * Поэтому для эффективности требуется аккумулировать и поддерживать в ОЗУ + * огромные списки страниц, а затем сохранять их обратно в БД. Текущий формат + * БД (без битовых карт) для этого крайне не удачен. Поэтому эта тактика не + * имеет шансов быть успешной без смены формата БД (Mithril). + * + * 3. Стараться экономить последовательности страниц. Это позволяет избегать + * лишнего чтения/поиска в GC при более-менее постоянном размещении и/или + * обновлении данных требующих более одной страницы. Проблема в том, что без + * информации от приложения библиотека не может знать насколько + * востребованными будут последовательности в ближайшей перспективе, а + * экономия последовательностей "на всякий случай" не только затратна + * сама-по-себе, но и работает во вред. 
+ * + * Поэтому: + * - в TODO добавляется разделение relist на «ближние» и «дальние» страницы, + * с последующей реализацией первой тактики; + * - преимущественное использование последовательностей отправляется + * в MithrilDB как составляющая "HDD friendly" feature; + * - реализованная в 3757eb72f7c6b46862f8f17881ac88e8cecc1979 экономия + * последовательностей отключается через MDBX_ENABLE_SAVING_SEQUENCES=0. + * + * В качестве альтернативы для безусловной «экономии» последовательностей, + * в следующих версиях libmdbx, вероятно, будет предложено + * API для взаимодействия с GC: + * - получение размера GC, включая гистограммы размеров последовательностей + * и близости к концу БД; + * - включение формирования "линейного запаса" для последующего использования + * в рамках текущей транзакции; + * - намеренная загрузка GC в память для коагуляции и "выпрямления"; + * - намеренное копирование данных из страниц в конце БД для последующего + * их освобождения, т.е. контролируемая компактификация по запросу. */ + +#ifndef MDBX_ENABLE_SAVING_SEQUENCES +#define MDBX_ENABLE_SAVING_SEQUENCES 0 +#endif + if (MDBX_ENABLE_SAVING_SEQUENCES && unlikely(target[dir] == *target + 1) && + len > 2) { + /* Пытаемся пропускать последовательности при наличии одиночных элементов. + * TODO: необходимо кэшировать пропускаемые последовательности + * чтобы не сканировать список сначала при каждом выделении. 
*/ pgno_t *scan = target + dir + dir; size_t left = len; do { @@ -6799,7 +6853,7 @@ __hot static pgno_t pnl_get_single(MDBX_PNL pnl) { #else /* вырезаем элемент с перемещением хвоста */ const pgno_t pgno = *scan; - MDBX_PNL_SETSIZE(pnl, len - 1); + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); while (++scan <= target) scan[-1] = *scan; return pgno; @@ -6812,45 +6866,47 @@ __hot static pgno_t pnl_get_single(MDBX_PNL pnl) { const pgno_t pgno = *target; #if MDBX_PNL_ASCENDING /* вырезаем элемент с перемещением хвоста */ - MDBX_PNL_SETSIZE(pnl, len - 1); - for (const pgno_t *const end = pnl + len - 1; target <= end; ++target) + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); + for (const pgno_t *const end = txn->tw.relist + len - 1; target <= end; + ++target) *target = target[1]; #else /* перемещать хвост не нужно, просто усекам список */ - MDBX_PNL_SETSIZE(pnl, len - 1); + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); #endif return pgno; } -__hot static pgno_t pnl_get_sequence(MDBX_PNL pnl, const size_t num, - uint8_t flags) { - const size_t len = MDBX_PNL_GETSIZE(pnl); - pgno_t *edge = MDBX_PNL_EDGE(pnl); +__hot static pgno_t relist_get_sequence(MDBX_txn *txn, const size_t num, + uint8_t flags) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); + pgno_t *edge = MDBX_PNL_EDGE(txn->tw.relist); assert(len >= num && num > 1); const size_t seq = num - 1; #if !MDBX_PNL_ASCENDING if (edge[-(ptrdiff_t)seq] - *edge == seq) { if (unlikely(flags & MDBX_ALLOC_RESERVE)) return P_INVALID; - assert(edge == scan4range_checker(pnl, seq)); + assert(edge == scan4range_checker(txn->tw.relist, seq)); /* перемещать хвост не нужно, просто усекам список */ - MDBX_PNL_SETSIZE(pnl, len - num); + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); return *edge; } #endif pgno_t *target = scan4seq_impl(edge, len, seq); - assert(target == scan4range_checker(pnl, seq)); + assert(target == scan4range_checker(txn->tw.relist, seq)); if (target) { if (unlikely(flags & MDBX_ALLOC_RESERVE)) return P_INVALID; const 
pgno_t pgno = *target; /* вырезаем найденную последовательность с перемещением хвоста */ - MDBX_PNL_SETSIZE(pnl, len - num); + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); #if MDBX_PNL_ASCENDING - for (const pgno_t *const end = pnl + len - num; target <= end; ++target) + for (const pgno_t *const end = txn->tw.relist + len - num; target <= end; + ++target) *target = target[num]; #else - for (const pgno_t *const end = pnl + len; ++target <= end;) + for (const pgno_t *const end = txn->tw.relist + len; ++target <= end;) target[-(ptrdiff_t)num] = *target; #endif return pgno; @@ -7094,7 +7150,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - pgno = pnl_get_sequence(txn->tw.relist, num, flags); + pgno = relist_get_sequence(txn, num, flags); if (likely(pgno)) goto done; } @@ -7231,10 +7287,10 @@ next_gc:; eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); if (likely(num == 1)) { - pgno = pnl_get_single(txn->tw.relist); + pgno = relist_get_single(txn); goto done; } - pgno = pnl_get_sequence(txn->tw.relist, num, flags); + pgno = relist_get_sequence(txn, num, flags); if (likely(pgno)) goto done; } @@ -7331,10 +7387,10 @@ scan: MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); if (likely(num == 1)) { eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); - pgno = pnl_get_single(txn->tw.relist); + pgno = relist_get_single(txn); goto done; } - pgno = pnl_get_sequence(txn->tw.relist, num, flags); + pgno = relist_get_sequence(txn, num, flags); if (likely(pgno)) goto done; } @@ -7587,8 +7643,7 @@ __hot static pgr_t page_alloc(const MDBX_cursor *const mc) { } if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) - return page_alloc_finalize(txn->mt_env, txn, mc, - pnl_get_single(txn->tw.relist), 1); + return 
page_alloc_finalize(txn->mt_env, txn, mc, relist_get_single(txn), 1); return page_alloc_slowpath(mc, 1, MDBX_ALLOC_DEFAULT); } diff --git a/src/internals.h b/src/internals.h index 2fc2b7a3..51ff05ca 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1078,7 +1078,7 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *relist; /* Reclaimed GC pages */ + MDBX_PNL relist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; From 4ee8fff30547b97569ac482b39dea27f4baeaecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 9 Dec 2022 19:18:17 +0300 Subject: [PATCH 265/364] =?UTF-8?q?mdbx:=20+1=20=D0=BA=20=D0=BF=D0=BE?= =?UTF-8?q?=D0=B4=D0=B3=D0=BE=D1=82=D0=B0=D0=B2=D0=BB=D0=B8=D0=B2=D0=B0?= =?UTF-8?q?=D0=B5=D0=BC=D0=BE=D0=BC=D1=83=20=D1=80=D0=B5=D0=B7=D0=B5=D1=80?= =?UTF-8?q?=D0=B2=D1=83=20=D0=B2=20=D0=B2=D1=8B=D1=80=D0=BE=D0=B6=D0=B4?= =?UTF-8?q?=D0=B5=D0=BD=D0=BD=D1=8B=D1=85=20=D1=81=D0=BB=D1=83=D1=87=D0=B0?= =?UTF-8?q?=D1=8F=D1=85=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B4=20=D0=BE=D0=B1?= =?UTF-8?q?=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20GC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 0bb42154..db00fc66 100644 --- a/src/core.c +++ b/src/core.c @@ -9927,7 +9927,9 @@ static int gcu_touch(gcu_context_t *ctx) { * during a deleting, when GC tree is unbalanced. 
*/ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { const size_t for_cow = txn->mt_dbs[FREE_DBI].md_depth; - const size_t for_rebalance = for_cow + 1; + const size_t for_rebalance = for_cow + 1 + + (txn->mt_dbs[FREE_DBI].md_depth + 1ul >= + txn->mt_dbs[FREE_DBI].md_branch_pages); size_t for_split = ctx->retired_stored == 0; const intptr_t retired_left = From cd0ed2f155cf76f2973bcac29ab3637c368937d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 10 Dec 2022 01:20:27 +0300 Subject: [PATCH 266/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 76 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 3aa0f7ee..70d2b89d 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,7 +4,15 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.3 (Акула) в процессе подготовки + +## v0.12.3 (Акула) запланирован на 2022-12-11 + +Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". + +``` +18 files changed, 2470 insertions(+), 1562 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` Благодарности: @@ -13,17 +21,76 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Новое: - - Использование адреса https://libmdbx.dqdkfa.ru/dead-github для отсылки к сохранённым в web.archive.org копиям ресурсов, уничтоженных администрацией Github. 
+ - Использование адреса [https://libmdbx.dqdkfa.ru/dead-github](https://libmdbx.dqdkfa.ru/dead-github) + для отсылки к сохранённым в web.archive.org копиям ресурсов, уничтоженных администрацией Github. + + - Реализована prefault-запись при выделении страниц для read-write отображений. + Это приводит к кратному снижению системных издержек и существенному увеличению + производительности в соответствующих сценариях использования, когда: + - размер БД и объём данных существенно больше ОЗУ; + - используется режим `MDBX_WRITEMAP`; + - не-мелкие транзакции (по ходу транзакции выделяется многие сотни или тысячи страниц). + + В режиме `MDBX_WRITEMAP` выделение/переиспользование страниц приводит + к page-fault и чтению страницы с диска, даже если содержимое страницы + не нужно (будет перезаписано). Это является следствием работы подсистемы + виртуальной памяти, а штатный способ лечения через `MADV_REMOVE` + работает не на всех ФС и обычно дороже получаемой экономии. + + Теперь в libmdbx используется "упреждающая запись" таких страниц, + которая на системах с [unified page cache](https://www.opennet.ru/base/dev/ubc.txt.html) + приводит к "вталкиванию" данных, устраняя необходимость чтения с диска при + обращении к такой странице памяти. + + Новый функционал работает в согласованности с автоматическим управлением read-ahead + и кэшем статуса присутствия страниц в ОЗУ, посредством [mincore()](https://man7.org/linux/man-pages/man2/mincore.2.html). + + - Реализован динамический выбор между сквозной записью на диск и обычной записью + с последующим [fdatasync()](https://man7.org/linux/man-pages/man3/fdatasync.3p.html) + управляемый опцией `MDBX_opt_writethrough_threshold`. + + В долговечных (durable) режимах данные на диск могут быть сброшены двумя способами: + - сквозной записью через файловый дескриптор открытый с `O_DSYNC`; + - обычной записью с последующим вызовом `fdatasync()`. 
+ + Первый способ выгоднее при записи малого количества страниц и/или если + канал взаимодействия с диском/носителем имеет близкую к нулю задержку. + Второй способ выгоднее если требуется записать много страниц и/или канал + взаимодействия имеет весомую задержку (датацентры, облака). Добавленная + опция `MDBX_opt_writethrough_threshold` позволяет во время выполнения + задать порог для динамического выбора способа записи в зависимости от + объема и конкретных условий использования. + + - Автоматическая установка `MDBX_opt_rp_augment_limit` в зависимости от размера БД. + + - Запрещение разного режима `MDBX_WRITEMAP` между процессами в режимах + с отложенной/ленивой записью, так как в этом случае невозможно + обеспечить сброс данных на диск во всех случаях на всех поддерживаемых платформах. Исправления (без корректировок новых функций): - - Устранение SIGSEGV или ошибочного вызова `free()` в ситуациях повторного открытия среды посредством `mdbx_env_open()`. + - Изменение размера отображения если это требуется для сброса данных на + диск при вызове `mdbx_env_sync()` из параллельного потока выполнения вне + работающей транзакции. + + - Исправление регресса после коммита db72763de049d6e4546f838277fe83b9081ad1de от 2022-10-08 + в логике возврата грязных страниц в режиме `MDBX_WRITEMAP`, из-за чего + освободившиеся страницы использовались не немедленно, а попадали в + retired-список совершаемой транзакции и происходил необоснованный рост + размера транзакции. + + - Устранение SIGSEGV или ошибочного вызова `free()` в ситуациях + повторного открытия среды посредством `mdbx_env_open()`. + - Устранение ошибки совершенной в коммите fe20de136c22ed3bc4c6d3f673e79c106e824f60 от 2022-09-18, в результате чего на Linux в режиме `MDBX_WRITEMAP` никогда не вызывался `msync()`. Проблема существует только в релизе 0.12.2. 
+ - Добавление подсчета грязных страниц в `MDBX_WRITEMAP` для предоставления посредством `mdbx_txn_info()` актуальной информации об объеме изменений в процессе транзакций чтения-записи. + - Исправление несущественной опечатки в условиях `#if` определения порядка байт. + - Исправление сборки для случая `MDBX_PNL_ASCENDING=1`. Ликвидация технических долгов и мелочи: @@ -32,6 +99,9 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Устранение несущественных предупреждений Coverity. - Использование единого курсора для поиска в GC. - Переработка внутренних флагов связанных с выделением страниц из GC. + - Доработка подготовки резерва перед обновлением GC при включенном BigFoot. + - Оптимизация `pnl_merge()` для случаев неперекрывающихся объединяемых списков. + - Оптимизация поддержки отсортированного списка страниц в `dpl_append()`. ------------------------------------------------------------------------------- From 1ae6a398edba34bcae875f9a58e2a7909ad94cf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 10 Dec 2022 14:44:15 +0300 Subject: [PATCH 267/364] =?UTF-8?q?mdbx-windows:=20=D0=B8=D1=81=D0=BF?= =?UTF-8?q?=D1=80=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=83=D1=82?= =?UTF-8?q?=D0=B5=D1=87=D0=BA=D0=B8=20overlapped-=D0=B4=D0=B5=D1=81=D0=BA?= =?UTF-8?q?=D1=80=D0=B8=D0=BF=D1=82=D0=BE=D1=80=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 6 ++---- src/osal.c | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core.c b/src/core.c index db00fc66..0c3d7cbc 100644 --- a/src/core.c +++ b/src/core.c @@ -14877,10 +14877,8 @@ __cold static int env_close(MDBX_env *env) { } #if defined(_WIN32) || defined(_WIN64) - if (env->me_overlapped_fd) { - CloseHandle(env->me_overlapped_fd); - env->me_overlapped_fd = 0; - } + eASSERT(env, 
!env->me_overlapped_fd || + env->me_overlapped_fd == INVALID_HANDLE_VALUE); if (env->me_data_lock_event != INVALID_HANDLE_VALUE) { CloseHandle(env->me_data_lock_event); env->me_data_lock_event = INVALID_HANDLE_VALUE; diff --git a/src/osal.c b/src/osal.c index 69a0b49f..91276c66 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1155,6 +1155,8 @@ MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) { osal_memalign_free(ior->pool); osal_free(ior->event_pool); CloseHandle(ior->async_done); + if (ior->overlapped_fd) + CloseHandle(ior->overlapped_fd); #else osal_free(ior->pool); #endif From 1c93cff8252dea5af1d9298c5e09584ad439d1a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 11 Dec 2022 00:14:40 +0300 Subject: [PATCH 268/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D0=BD=D0=B8=D1=82=D0=B5=D0=BB=D1=8C=D0=BD=D1=8B=D0=B5=20=D1=83?= =?UTF-8?q?=D1=81=D0=BB=D0=BE=D0=B2=D0=B8=D1=8F=20=D0=B4=D0=BB=D1=8F=20pre?= =?UTF-8?q?fault-write.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index 0c3d7cbc..311e52d3 100644 --- a/src/core.c +++ b/src/core.c @@ -7046,8 +7046,24 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, * неё пишет ЦПУ. */ const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1; const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1; - /* Не суетимся если страница в зоне включенного упреждающего чтения */ - if (!readahead_enabled || pgno + num > readahead_edge) { + /* В случае если страница в памяти процесса, то излишняя запись может быть + * достаточно дорогой. 
Кроме системного вызова и копирования данных, в особо + * одаренных ОС при этом могут включаться файловая система, выделяться + * временная страница, пополняться очереди асинхронного выполнения, + * обновляться PTE с последующей генерацией page-fault и чтением данных из + * грязной I/O очереди. Из-за этого штраф за лишнюю запись может быть + * сравним с избегаемым ненужным чтением. + * + * Проверка посредством mincore() существенно снижает затраты, но в + * простейших случаях (тривиальный бенчмарк) интегральная производительность + * становится вдвое меньше. А на платформах без mincore() и с проблемной + * подсистемой виртуальной памяти ситуация может быть многократно хуже. + * Поэтому избегаем затрат в ситуациях когда prefault-write скорее всего не + * нужна. Стоит подумать над дополнительными критериями. */ + if (/* Не суетимся если GC почти пустая и БД маленькая */ + (txn->mt_dbs[FREE_DBI].md_branch_pages || txn->mt_geo.now > 1234) && + /* Не суетимся если страница в зоне включенного упреждающего чтения */ + (!readahead_enabled || pgno + num > readahead_edge)) { void *const pattern = ptr_disp( env->me_pbuf, need_clean ? 
env->me_psize : env->me_psize * 2); size_t file_offset = pgno2bytes(env, pgno); From 0884f28f855f9aefbb4ba6c2a3585900cf457290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 11 Dec 2022 16:46:20 +0300 Subject: [PATCH 269/364] =?UTF-8?q?mdbx-tools:=20=D1=83=D1=81=D0=BA=D0=BE?= =?UTF-8?q?=D1=80=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=80=D0=B0=D0=B1=D0=BE=D1=82?= =?UTF-8?q?=D1=8B=20`mdbx=5Fchk`=20=D0=BF=D1=80=D0=B8=20=D0=BE=D0=B1=D1=80?= =?UTF-8?q?=D0=B0=D0=B1=D0=BE=D1=82=D0=BA=D0=B5=20=D0=BF=D0=BE=D0=BB=D1=8C?= =?UTF-8?q?=D0=B7=D0=BE=D0=B2=D0=B0=D1=82=D0=B5=D0=BB=D1=8C=D1=81=D0=BA?= =?UTF-8?q?=D0=B8=D1=85=20=D0=B7=D0=B0=D0=BF=D0=B8=D1=81=D0=B5=D0=B9=20?= =?UTF-8?q?=D0=B2=20`@MAIN`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mdbx_chk.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index df289401..2496b582 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -620,6 +620,8 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, int rc; size_t i; + if (data->iov_len != sizeof(MDBX_db)) + return handle_userdb(record_number, key, data); name = key->iov_base; for (i = 0; i < key->iov_len; ++i) { if (name[i] < ' ') @@ -631,12 +633,13 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, return MDBX_ENOMEM; memcpy(name, key->iov_base, key->iov_len); name[key->iov_len] = '\0'; - userdb_count++; rc = process_db(~0u, name, handle_userdb, false); osal_free(name); - if (rc != MDBX_INCOMPATIBLE) + if (rc != MDBX_INCOMPATIBLE) { + userdb_count++; return rc; + } return handle_userdb(record_number, key, data); } @@ -737,10 +740,9 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, return MDBX_SUCCESS; } - if (!silent && verbose) { + if (!silent && verbose) 
print("Processing '%s'...\n", dbi_name ? dbi_name : "@MAIN"); - fflush(nullptr); - } + fflush(nullptr); rc = mdbx_dbi_flags(txn, dbi_handle, &flags); if (rc) { @@ -1639,15 +1641,6 @@ int main(int argc, char *argv[]) { fflush(nullptr); } - if (!verbose) - print("Iterating DBIs...\n"); - if (data_tree_problems) { - print("Skip processing %s since tree is corrupted (%u problems)\n", "@MAIN", - data_tree_problems); - problems_maindb = data_tree_problems; - } else - problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false); - if (gc_tree_problems) { print("Skip processing %s since tree is corrupted (%u problems)\n", "@GC", gc_tree_problems); @@ -1685,7 +1678,7 @@ int main(int argc, char *argv[]) { print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); } - if (problems_maindb == 0 && problems_freedb == 0) { + if ((problems_maindb = data_tree_problems) == 0 && problems_freedb == 0) { if (!dont_traversal && (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { if (walk.pgcount != alloc_pages - gc_pages) { @@ -1702,10 +1695,20 @@ int main(int argc, char *argv[]) { "monopolistic or read-write mode only)\n"); } - if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) { - if (!userdb_count && verbose) - print(" - does not contain multiple databases\n"); + problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false); + if (problems_maindb == 0) { + print("Scanning %s for %s...\n", "@MAIN", "sub-database(s)"); + if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) { + if (!userdb_count && verbose) + print(" - does not contain multiple databases\n"); + } + } else { + print("Skip processing %s since %s is corrupted (%u problems)\n", + "sub-database(s)", "@MAIN", problems_maindb); } + } else { + print("Skip processing %s since %s is corrupted (%u problems)\n", "@MAIN", + "tree", data_tree_problems); } if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && From 69f7d6cdd8fd569fcfb51d5a2aef1249a5a8fb5f 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 11 Dec 2022 20:26:28 +0300 Subject: [PATCH 270/364] =?UTF-8?q?mdbx-tools:=20=D0=BD=D0=B5=D1=81=D1=83?= =?UTF-8?q?=D1=89=D0=B5=D1=81=D1=82=D0=B2=D0=B5=D0=BD=D0=BD=D1=8B=D0=B9=20?= =?UTF-8?q?=D1=80=D0=B5=D1=84=D0=B0=D0=BA=D1=82=D0=BE=D1=80=D0=B8=D0=BD?= =?UTF-8?q?=D0=B3=20`mdbx=5Fchk`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mdbx_chk.c | 169 +++++++++++++++++++++++++------------------------ 1 file changed, 88 insertions(+), 81 deletions(-) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 2496b582..d1cb718e 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -222,7 +222,7 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { } if (verbose > 0 && !silent) { - print(" - found '%s' area\n", dbi_name); + print(" - found `%s` area\n", dbi_name); fflush(nullptr); } @@ -491,8 +491,7 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data); -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, - bool silent); +static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler); static int handle_userdb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { @@ -634,7 +633,7 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, memcpy(name, key->iov_base, key->iov_len); name[key->iov_len] = '\0'; - rc = process_db(~0u, name, handle_userdb, false); + rc = process_db(~0u, name, handle_userdb); osal_free(name); if (rc != MDBX_INCOMPATIBLE) { userdb_count++; @@ -694,8 +693,7 @@ static const char *db_flags2valuemode(unsigned flags) { } } -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, - bool silent) { +static int 
process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler) { MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; @@ -704,12 +702,13 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, int rc, i; struct problem *saved_list; uint64_t problems_count; + const bool second_pass = dbi_handle == MAIN_DBI; uint64_t record_count = 0, dups = 0; uint64_t key_bytes = 0, data_bytes = 0; if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) { - print(" ! abort processing '%s' due to a previous error\n", + print(" ! abort processing `%s` due to a previous error\n", dbi_name ? dbi_name : "@MAIN"); return MDBX_BAD_TXN; } @@ -723,8 +722,8 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (!dbi_name || rc != MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { - error("mdbx_dbi_open('%s') failed, error %d %s\n", - dbi_name ? dbi_name : "main", rc, mdbx_strerror(rc)); + error("mdbx_dbi_open(`%s`) failed, error %d %s\n", + dbi_name ? dbi_name : "@MAIN", rc, mdbx_strerror(rc)); } return rc; } @@ -733,15 +732,15 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (dbi_handle >= CORE_DBS && dbi_name && only_subdb && strcmp(only_subdb, dbi_name) != 0) { if (verbose) { - print("Skip processing '%s'...\n", dbi_name); + print("Skip processing %s...\n", dbi_name); fflush(nullptr); } skipped_subdb++; return MDBX_SUCCESS; } - if (!silent && verbose) - print("Processing '%s'...\n", dbi_name ? dbi_name : "@MAIN"); + if (!second_pass && verbose) + print("Processing %s...\n", dbi_name ? 
dbi_name : "@MAIN"); fflush(nullptr); rc = mdbx_dbi_flags(txn, dbi_handle, &flags); @@ -756,7 +755,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, return rc; } - if (!silent && verbose) { + if (!second_pass && verbose) { print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags), db_flags2valuemode(flags)); if (verbose > 1) { @@ -832,57 +831,75 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (rc) goto bailout; - bool bad_key = false; - if (key.iov_len > maxkeysize) { - problem_add("entry", record_count, "key length exceeds max-key-size", - "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); - bad_key = true; - } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && - key.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong key length", - "%" PRIuPTR " != 4or8", key.iov_len); - bad_key = true; - } + if (!second_pass) { + bool bad_key = false; + if (key.iov_len > maxkeysize) { + problem_add("entry", record_count, "key length exceeds max-key-size", + "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); + bad_key = true; + } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && + key.iov_len != sizeof(uint32_t)) { + problem_add("entry", record_count, "wrong key length", + "%" PRIuPTR " != 4or8", key.iov_len); + bad_key = true; + } - bool bad_data = false; - if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && - data.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong data length", - "%" PRIuPTR " != 4or8", data.iov_len); - bad_data = true; - } - - if (prev_key.iov_base) { - if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && - prev_data.iov_len != data.iov_len) { - problem_add("entry", record_count, "different data length", - "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, - data.iov_len); + bool bad_data = false; + if ((flags & MDBX_INTEGERDUP) && data.iov_len != 
sizeof(uint64_t) && + data.iov_len != sizeof(uint32_t)) { + problem_add("entry", record_count, "wrong data length", + "%" PRIuPTR " != 4or8", data.iov_len); bad_data = true; } - if (!bad_key) { - int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); - if (cmp == 0) { - ++dups; - if ((flags & MDBX_DUPSORT) == 0) { - problem_add("entry", record_count, "duplicated entries", nullptr); - if (prev_data.iov_base && data.iov_len == prev_data.iov_len && - memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) { - problem_add("entry", record_count, "complete duplicate", nullptr); - } - } else if (!bad_data && prev_data.iov_base) { - cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); - if (cmp == 0) { - problem_add("entry", record_count, "complete duplicate", nullptr); - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", record_count, "wrong order of multi-values", - nullptr); - } - } - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", record_count, "wrong order of entries", nullptr); + if (prev_key.iov_base) { + if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && + prev_data.iov_len != data.iov_len) { + problem_add("entry", record_count, "different data length", + "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, + data.iov_len); + bad_data = true; } + + if (!bad_key) { + int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); + if (cmp == 0) { + ++dups; + if ((flags & MDBX_DUPSORT) == 0) { + problem_add("entry", record_count, "duplicated entries", nullptr); + if (prev_data.iov_base && data.iov_len == prev_data.iov_len && + memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == + 0) { + problem_add("entry", record_count, "complete duplicate", + nullptr); + } + } else if (!bad_data && prev_data.iov_base) { + cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); + if (cmp == 0) { + problem_add("entry", record_count, "complete duplicate", + nullptr); + } else if (cmp < 0 && !ignore_wrong_order) { + problem_add("entry", 
record_count, + "wrong order of multi-values", nullptr); + } + } + } else if (cmp < 0 && !ignore_wrong_order) { + problem_add("entry", record_count, "wrong order of entries", + nullptr); + } + } + } + + if (!bad_key) { + if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) + print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); + prev_key = key; + } + if (!bad_data) { + if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && + !prev_data.iov_base) + print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); + prev_data = data; } } @@ -896,17 +913,6 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, key_bytes += key.iov_len; data_bytes += data.iov_len; - if (!bad_key) { - if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) - print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); - prev_key = key; - } - if (!bad_data) { - if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && - !prev_data.iov_base) - print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); - prev_data = data; - } rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT); } if (rc != MDBX_NOTFOUND) @@ -919,7 +925,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries); bailout: problems_count = problems_pop(saved_list); - if (!silent && verbose) { + if (!second_pass && verbose) { print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64 " key's bytes, %" PRIu64 " data's " "bytes, %" PRIu64 " problems\n", @@ -1225,8 +1231,9 @@ int main(int argc, char *argv[]) { rc = EXIT_INTERRUPTED; } if (only_subdb || dont_traversal) { - error("whole database checking with tree-traversal are required to turn " - "to the specified meta-page.\n"); + error( + "whole database checking with b-tree traversal are required to turn " + "to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } } @@ -1642,11 +1649,11 @@ int main(int argc, char *argv[]) { } if (gc_tree_problems) 
{ - print("Skip processing %s since tree is corrupted (%u problems)\n", "@GC", - gc_tree_problems); + print("Skip processing %s since %s is corrupted (%u problems)\n", "@GC", + "b-tree", gc_tree_problems); problems_freedb = gc_tree_problems; } else - problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb, false); + problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb); if (verbose) { uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize; @@ -1687,18 +1694,18 @@ int main(int argc, char *argv[]) { walk.pgcount, alloc_pages - gc_pages); } if (unused_pages != gc_pages) { - error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", + error("GC pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", unused_pages, gc_pages); } } else if (verbose) { - print(" - skip check used and gc pages (btree-traversal with " + print(" - skip check used and GC pages (btree-traversal with " "monopolistic or read-write mode only)\n"); } - problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false); + problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr); if (problems_maindb == 0) { print("Scanning %s for %s...\n", "@MAIN", "sub-database(s)"); - if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) { + if (!process_db(MAIN_DBI, nullptr, handle_maindb)) { if (!userdb_count && verbose) print(" - does not contain multiple databases\n"); } @@ -1708,7 +1715,7 @@ int main(int argc, char *argv[]) { } } else { print("Skip processing %s since %s is corrupted (%u problems)\n", "@MAIN", - "tree", data_tree_problems); + "b-tree", data_tree_problems); } if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && From 54b15d7e413e14a381cf866242355763d0d52791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 12 Dec 2022 01:20:22 +0300 Subject: [PATCH 271/364] 
=?UTF-8?q?mdbx:=20=D0=BE=D0=BF=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D0=B5=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20in-core=20=D0=91=D0=94?= =?UTF-8?q?=20(=D0=B2=20tmpfs/ramfs/mfs)=20=D1=81=20=D0=BE=D1=82=D0=BA?= =?UTF-8?q?=D0=BB=D1=8E=D1=87=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20prefault-wri?= =?UTF-8?q?te.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Это вынужденный читинг для "починки" сравнительных бенчмарков при размещении БД в /dev/shm. Проблема в том, что актуальные ядра Linux для файлов размещенных в tmpfs возвращают mincore=false. В результате, в простейших бенчмарках видно двукратное снижение производительности, просто из-за вызовов write() выполняемых для prefault. Из-за этого, в таких синтетических тестах, новая libmdbx становится существенно медленнее предыдущих версий, в том числе LMDB. --- src/core.c | 126 +++++++++++++++++++++++++++++++----------------- src/internals.h | 4 ++ src/osal.c | 44 +++++++++++++++++ src/osal.h | 1 + 4 files changed, 130 insertions(+), 45 deletions(-) diff --git a/src/core.c b/src/core.c index 311e52d3..59cda19f 100644 --- a/src/core.c +++ b/src/core.c @@ -5920,13 +5920,28 @@ __cold static void munlock_all(const MDBX_env *env) { } __cold static unsigned default_rp_augment_limit(const MDBX_env *env) { - /* default drp_augment_limit = ceil(npages / gold_ratio) */ + /* default rp_augment_limit = ceil(npages / gold_ratio) */ const size_t augment = (env->me_dbgeo.now >> (env->me_psize2log + 10)) * 633u; eASSERT(env, augment < MDBX_PGL_LIMIT); return pnl_bytes2size(pnl_size2bytes( (augment > MDBX_PNL_INITIAL) ? 
augment : MDBX_PNL_INITIAL)); } +__cold static bool default_prefault_write(const MDBX_env *env) { + if (env->me_incore || + (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) != MDBX_WRITEMAP) + return false; + + return !MDBX_MMAP_INCOHERENT_FILE_WRITE; +} + +static void adjust_defaults(MDBX_env *env) { + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + if (!env->me_options.flags.non_auto.prefault_write) + env->me_options.prefault_write = default_prefault_write(env); +} + __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, const pgno_t size_pgno, const pgno_t limit_pgno, const bool implicit) { @@ -6120,8 +6135,7 @@ bailout: /* update env-geo to avoid influences */ env->me_dbgeo.now = env->me_dxb_mmap.current; env->me_dbgeo.upper = env->me_dxb_mmap.limit; - if (!env->me_options.flags.non_auto.rp_augment_limit) - env->me_options.rp_augment_limit = default_rp_augment_limit(env); + adjust_defaults(env); #ifdef MDBX_USE_VALGRIND if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); @@ -7044,26 +7058,15 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, * с диска. При этом запись на диск должна быть отложена адекватным ядром, * так как страница отображена в память в режиме чтения-записи и следом в * неё пишет ЦПУ. */ - const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1; - const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1; + /* В случае если страница в памяти процесса, то излишняя запись может быть * достаточно дорогой. Кроме системного вызова и копирования данных, в особо * одаренных ОС при этом могут включаться файловая система, выделяться * временная страница, пополняться очереди асинхронного выполнения, * обновляться PTE с последующей генерацией page-fault и чтением данных из * грязной I/O очереди. 
Из-за этого штраф за лишнюю запись может быть - * сравним с избегаемым ненужным чтением. - * - * Проверка посредством minicore() существенно снижает затраты, но в - * простейших случаях (тривиальный бенчмарк) интегральная производительность - * становится вдвое меньше. А на платформах без minocore() и с проблемной - * подсистемой виртуальной памяти ситуация может быть многократно хуже. - * Поэтому избегаем затрат в ситуациях когда prefaukt-write скорее всего не - * нужна. Стоит подумать над дополнительными критериями. */ - if (/* Не суетимся если GC почти пустая и БД маленькая */ - (txn->mt_dbs[FREE_DBI].md_branch_pages || txn->mt_geo.now > 1234) && - /* Не суетимся если страница в зоне включенного упреждающего чтения */ - (!readahead_enabled || pgno + num > readahead_edge)) { + * сравним с избегаемым ненужным чтением. */ + if (env->me_prefault_write) { void *const pattern = ptr_disp( env->me_pbuf, need_clean ? env->me_psize : env->me_psize * 2); size_t file_offset = pgno2bytes(env, pgno); @@ -7200,6 +7203,24 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, gc->mc_txn = txn; gc->mc_flags = 0; + env->me_prefault_write = env->me_options.prefault_write; + if (env->me_prefault_write) { + /* Проверка посредством minicore() существенно снижает затраты, но в + * простейших случаях (тривиальный бенчмарк) интегральная производительность + * становится вдвое меньше. А на платформах без mincore() и с проблемной + * подсистемой виртуальной памяти ситуация может быть многократно хуже. + * Поэтому избегаем затрат в ситуациях когда prefaukt-write скорее всего не + * нужна. 
*/ + const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1; + const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1; + if (/* Не суетимся если GC почти пустая и БД маленькая */ + (txn->mt_dbs[FREE_DBI].md_branch_pages == 0 && + txn->mt_geo.now < 1234) || + /* Не суетимся если страница в зоне включенного упреждающего чтения */ + (readahead_enabled && pgno + num < readahead_edge)) + env->me_prefault_write = false; + } + retry_gc_refresh_oldest:; txnid_t oldest = txn_oldest_reader(txn); retry_gc_have_oldest: @@ -12359,7 +12380,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, mode_bits |= MDBX_SYNC_SIZE; if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; - } + } else if (unlikely(env->me_incore)) + goto skip_incore_sync; if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.msync.weak += sync_op; @@ -12391,6 +12413,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); + skip_incore_sync: eASSERT(env, env->me_lck->mti_unsynced_pages.weak > 0); eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0); unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); @@ -12495,35 +12518,38 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, memcpy(target->mm_sign, pending->mm_sign, 8); osal_flush_incoherent_cpu_writeback(); jitter4testing(true); - if (!MDBX_AVOID_MSYNC) { - /* sync meta-pages */ + if (!env->me_incore) { + if (!MDBX_AVOID_MSYNC) { + /* sync meta-pages */ #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) - ? 
MDBX_SYNC_KICK - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - } else { + rc = osal_msync( + &env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_KICK + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - const MDBX_page *page = data_page(target); - rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - ptr_dist(page, env->me_map)); - if (likely(rc == MDBX_SUCCESS)) { - osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); - if ((flags & MDBX_NOMETASYNC) == 0 && - env->me_fd4meta == env->me_lazy_fd) { + const MDBX_page *page = data_page(target); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + ptr_dist(page, env->me_map)); + if (likely(rc == MDBX_SUCCESS)) { + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), + env->me_os_psize); + if ((flags & MDBX_NOMETASYNC) == 0 && + env->me_fd4meta == env->me_lazy_fd) { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; + env->me_lck->mti_pgop_stat.fsync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } } } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; } - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; } else { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; @@ -12542,7 +12568,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, } osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); /* sync meta-pages */ - if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd) { + if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd && + !env->me_incore) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.fsync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT 
*/ @@ -13050,8 +13077,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step)))); env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); - if (!env->me_options.flags.non_auto.rp_augment_limit) - env->me_options.rp_augment_limit = default_rp_augment_limit(env); + adjust_defaults(env); ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); @@ -14017,8 +14043,8 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { !F_ISSET(r, MDBX_UTTERLY_NOSYNC)) r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; - /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */ - if (r & MDBX_SAFE_NOSYNC) + /* force MDBX_NOMETASYNC if NOSYNC enabled */ + if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC)) r |= MDBX_NOMETASYNC; assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && @@ -14746,6 +14772,16 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, goto bailout; } + rc = osal_check_fs_incore(env->me_lazy_fd); + env->me_incore = false; + if (rc == MDBX_RESULT_TRUE) { + env->me_incore = true; + NOTICE("%s", "in-core database"); + } else if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("check_fs_incore(), err %d", rc); + goto bailout; + } + if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || (flags & MDBX_EXCLUSIVE) == 0)) { @@ -14784,8 +14820,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, } if ((flags & MDBX_RDONLY) == 0) { - if (!env->me_options.flags.non_auto.rp_augment_limit) - env->me_options.rp_augment_limit = default_rp_augment_limit(env); const size_t tsize = sizeof(MDBX_txn) + sizeof(MDBX_cursor), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + @@ -14821,6 +14855,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, ior_direct, 
env->me_overlapped_fd #endif /* Windows */ ); + if (rc == MDBX_SUCCESS) + adjust_defaults(env); } #if MDBX_DEBUG diff --git a/src/internals.h b/src/internals.h index 51ff05ca..4f0ec2ac 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1273,12 +1273,14 @@ struct MDBX_env { #if !(defined(_WIN32) || defined(_WIN64)) unsigned writethrough_threshold; #endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -1300,6 +1302,7 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env *me_lcklist_next; @@ -1308,6 +1311,7 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; diff --git a/src/osal.c b/src/osal.c index 91276c66..a18e3e96 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1760,6 +1760,50 @@ MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, return MDBX_SUCCESS; } +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle) { +#if defined(_WIN32) || defined(_WIN64) + (void)handle; +#else + struct statfs statfs_info; + if (fstatfs(handle, &statfs_info)) + return errno; + +#if defined(__OpenBSD__) + const unsigned type = 0; +#else + const unsigned type = statfs_info.f_type; +#endif + switch (type) { + case 0x28cd3d45 /* CRAMFS_MAGIC */: + case 0x858458f6 /* RAMFS_MAGIC */: + case 0x01021994 /* TMPFS_MAGIC */: + case 0x73717368 /* SQUASHFS_MAGIC */: + case 0x7275 /* ROMFS_MAGIC */: + return MDBX_RESULT_TRUE; + } + +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + 
defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ + defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) + const char *const name = statfs_info.f_fstypename; + const size_t name_len = sizeof(statfs_info.f_fstypename); +#else + const char *const name = ""; + const size_t name_len = 0; +#endif + if (name_len) { + if (strncasecmp("tmpfs", name, 6) == 0 || + strncasecmp("mfs", name, 4) == 0 || + strncasecmp("ramfs", name, 6) == 0 || + strncasecmp("romfs", name, 6) == 0) + return MDBX_RESULT_TRUE; + } +#endif /* !Windows */ + + return MDBX_RESULT_FALSE; +} + static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) diff --git a/src/osal.h b/src/osal.h index cdd6fa27..bd869403 100644 --- a/src/osal.h +++ b/src/osal.h @@ -585,6 +585,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, const pathchar_t *pathname, int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); From b959e217b1a07bbb12d7ec11cd5fd9ff8d2fbbdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 12 Dec 2022 15:02:08 +0300 Subject: [PATCH 272/364] =?UTF-8?q?mdbx:=20=D1=80=D0=B5=D1=84=D0=B0=D0=BA?= =?UTF-8?q?=D1=82=D0=BE=D1=80=D0=B8=D0=BD=D0=B3=20=D0=BE=D0=B1=D1=80=D0=B0?= =?UTF-8?q?=D0=B1=D0=BE=D1=82=D0=BA=D0=B8=20=D1=83=D1=81=D1=82=D0=B0=D0=BD?= =?UTF-8?q?=D0=BE=D0=B2=D0=BA=D0=B8=20=D0=BE=D0=BF=D1=86=D0=B8=D0=B9=20?= =?UTF-8?q?=D0=B2=20=D0=B7=D0=BD=D0=B0=D1=87=D0=B5=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=BF=D0=BE-=D1=83=D0=BC=D0=BE=D0=BB=D1=87=D0=B0=D0=BD=D0=B8?= =?UTF-8?q?=D1=8E.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- src/core.c | 63 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/src/core.c b/src/core.c index 59cda19f..74922f9d 100644 --- a/src/core.c +++ b/src/core.c @@ -24359,14 +24359,14 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, bool should_unlock = false; switch (option) { case MDBX_opt_sync_bytes: - if (value == UINT64_MAX) - value = SIZE_MAX - 65536; + if (value == /* default */ UINT64_MAX) + value = MAX_WRITE; if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; if (unlikely(value > SIZE_MAX - 65536)) - return MDBX_TOO_LARGE; + return MDBX_EINVAL; value = bytes2pgno(env, (size_t)value + env->me_psize - 1); if ((uint32_t)value != atomic_load32(&env->me_lck->mti_autosync_threshold, mo_AcquireRelease) && @@ -24382,14 +24382,14 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_sync_period: - if (value == UINT64_MAX) - value = UINT32_MAX; + if (value == /* default */ UINT64_MAX) + value = 2780315 /* 42.42424 секунды */; if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; if (unlikely(value > UINT32_MAX)) - return MDBX_TOO_LARGE; + return MDBX_EINVAL; value = osal_16dot16_to_monotime((uint32_t)value); if (value != atomic_load64(&env->me_lck->mti_autosync_period, mo_AcquireRelease) && @@ -24404,8 +24404,8 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_max_db: - if (value == UINT64_MAX) - value = MDBX_MAX_DBI; + if (value == /* default */ UINT64_MAX) + value = 42; if (unlikely(value > MDBX_MAX_DBI)) return MDBX_EINVAL; if (unlikely(env->me_map)) @@ -24414,7 +24414,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_max_readers: - if 
(value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = MDBX_READERS_LIMIT; if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) return MDBX_EINVAL; @@ -24424,7 +24424,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_dp_reserve_limit: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = INT_MAX; if (unlikely(value > INT_MAX)) return MDBX_EINVAL; @@ -24450,7 +24450,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_rp_augment_limit: - if (value == UINT64_MAX) { + if (value == /* default */ UINT64_MAX) { env->me_options.flags.non_auto.rp_augment_limit = 0; env->me_options.rp_augment_limit = default_rp_augment_limit(env); } else if (unlikely(value > MDBX_PGL_LIMIT)) @@ -24463,7 +24463,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, case MDBX_opt_txn_dp_limit: case MDBX_opt_txn_dp_initial: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = MDBX_PGL_LIMIT; if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4)) return MDBX_EINVAL; @@ -24498,34 +24498,38 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_spill_max_denominator: - if (value == UINT64_MAX) - value = 255; + if (value == /* default */ UINT64_MAX) + value = 8; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_max_denominator = (uint8_t)value; break; case MDBX_opt_spill_min_denominator: + if (value == /* default */ UINT64_MAX) + value = 8; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_min_denominator = (uint8_t)value; break; case MDBX_opt_spill_parent4child_denominator: + if (value == /* default */ UINT64_MAX) + value = 0; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_parent4child_denominator = (uint8_t)value; break; case MDBX_opt_loose_limit: - if (value == UINT64_MAX) - value = 255; + if (value == 
/* default */ UINT64_MAX) + value = 64; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.dp_loose_limit = (uint8_t)value; break; case MDBX_opt_merge_threshold_16dot16_percent: - if (value == UINT64_MAX) - value = 32768; + if (value == /* default */ UINT64_MAX) + value = 65536 / 4 /* 25% */; if (unlikely(value < 8192 || value > 32768)) return MDBX_EINVAL; env->me_options.merge_threshold_16dot16_percent = (unsigned)value; @@ -24533,23 +24537,22 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + /* позволяем "установить" значение по-умолчанию и совпадающее + * с поведением соответствующим текущей установке MDBX_NOMETASYNC */ + if (value == /* default */ UINT64_MAX && + value != ((env->me_flags & MDBX_NOMETASYNC) ? 0 : UINT_MAX)) + err = MDBX_EINVAL; +#else + if (value == /* default */ UINT64_MAX) + value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; if (value != (unsigned)value) err = MDBX_EINVAL; else -#if defined(_WIN32) || defined(_WIN64) - /* позволяем "установить" значение по-умолчанию и совпадающее - * с поведением соответствующим текущей установке MDBX_NOMETASYNC */ - if ((unsigned)-1 != (unsigned)value && - value != ((env->me_flags & MDBX_NOMETASYNC) ? 0 : INT_MAX)) - err = MDBX_EINVAL; -#else - env->me_options.writethrough_threshold = - ((unsigned)-1 == (unsigned)value) - ? 
MDBX_WRITETHROUGH_THRESHOLD_DEFAULT - : (unsigned)value; + env->me_options.writethrough_threshold = (unsigned)value; #endif - break; + default: return MDBX_EINVAL; } From 957c99d86fead8a7a65916a413a322a53ef9972c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 12 Dec 2022 15:23:47 +0300 Subject: [PATCH 273/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`MDBX=5Fopt=5Fprefault=5Fwrite?= =?UTF-8?q?=5Fenable`=20=D0=B2=D0=BC=D0=B5=D1=81=D1=82=D0=BE=20`MDBX=5FENA?= =?UTF-8?q?BLE=5FPREFAULT`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 5 +++++ src/core.c | 19 ++++++++++++++++--- src/options.h | 12 ------------ 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/mdbx.h b/mdbx.h index cb14bf33..1b3dcfd8 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2254,6 +2254,11 @@ enum MDBX_option_t { * On Windows a write-through is used always but \ref MDBX_NOMETASYNC could * be used for switching to write-and-flush. */ MDBX_opt_writethrough_threshold, + + /** \brief Controls prevention of page-faults of reclaimed and allocated pages + * in the \ref MDBX_WRITEMAP mode by clearing ones through file handle before + * touching. 
*/ + MDBX_opt_prefault_write_enable, }; #ifndef __cplusplus /** \ingroup c_settings */ diff --git a/src/core.c b/src/core.c index 74922f9d..544e5934 100644 --- a/src/core.c +++ b/src/core.c @@ -7043,7 +7043,6 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); -#if MDBX_ENABLE_PREFAULT /* Содержимое выделенной страницы не нужно, но если страница отсутствует * в ОЗУ (что весьма вероятно), то любое обращение к ней приведет * к page-fault: @@ -7107,7 +7106,6 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, need_clean = false; } } -#endif /* MDBX_ENABLE_PREFAULT */ } else { ret.page = page_malloc(txn, num); if (unlikely(!ret.page)) { @@ -24553,6 +24551,18 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, #endif break; + case MDBX_opt_prefault_write_enable: + if (value == /* default */ UINT64_MAX) { + env->me_options.prefault_write = default_prefault_write(env); + env->me_options.flags.non_auto.prefault_write = false; + } else if (value > 1) + err = MDBX_EINVAL; + else { + env->me_options.prefault_write = value != 0; + env->me_options.flags.non_auto.prefault_write = true; + } + break; + default: return MDBX_EINVAL; } @@ -24634,6 +24644,10 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, #endif break; + case MDBX_opt_prefault_write_enable: + *pvalue = env->me_options.prefault_write; + break; + default: return MDBX_EINVAL; } @@ -25120,7 +25134,6 @@ __dll_export " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) - " MDBX_ENABLE_PREFAULT=" MDBX_STRINGIFY(MDBX_ENABLE_PREFAULT) " MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE) " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) " MDBX_ENABLE_PROFGC=" 
MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) diff --git a/src/options.h b/src/options.h index 3e9c9243..92b0f56e 100644 --- a/src/options.h +++ b/src/options.h @@ -87,18 +87,6 @@ #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ -/** Controls prevention of page-faults of reclaimed and allocated pages in the - * MDBX_WRITEMAP mode by clearing ones through file handle before touching. */ -#ifndef MDBX_ENABLE_PREFAULT -#if MDBX_MMAP_INCOHERENT_FILE_WRITE -#define MDBX_ENABLE_PREFAULT 0 -#else -#define MDBX_ENABLE_PREFAULT 1 -#endif -#elif !(MDBX_ENABLE_PREFAULT == 0 || MDBX_ENABLE_PREFAULT == 1) -#error MDBX_ENABLE_PREFAULT must be defined as 0 or 1 -#endif /* MDBX_ENABLE_PREFAULT */ - /** Controls using Unix' mincore() to determine whether DB-pages * are resident in memory. */ #ifndef MDBX_ENABLE_MINCORE From 245a78291218c9801d64662080b8597efc546c66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 12 Dec 2022 18:35:08 +0300 Subject: [PATCH 274/364] =?UTF-8?q?mdbx:=20=D0=BD=D0=B5=20=D0=B8=D0=B3?= =?UTF-8?q?=D0=BD=D0=BE=D1=80=D0=B8=D1=80=D1=83=D0=B5=D0=BC=20=D0=BE=D1=88?= =?UTF-8?q?=D0=B8=D0=B1=D0=BA=D0=B8=20=D0=BF=D1=80=D0=B8=20=D0=BE=D1=82?= =?UTF-8?q?=D0=BA=D1=80=D1=8B=D1=82=D0=B8=D0=B8=20=D0=B4=D0=B5=D1=81=D0=BA?= =?UTF-8?q?=D1=80=D0=B8=D0=BF=D1=82=D0=BE=D1=80=D0=B0=20=D1=81=20O=5FDSYNC?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core.c b/src/core.c index 544e5934..27c226b0 100644 --- a/src/core.c +++ b/src/core.c @@ -14682,6 +14682,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC))) { rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); + if (MDBX_IS_ERROR(rc)) + goto 
bailout; if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { if ((flags & MDBX_NOMETASYNC) == 0) env->me_fd4meta = env->me_dsync_fd; From 167011c2d5bd07bb2efc6e76f98776beac4b5a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 12 Dec 2022 21:27:17 +0300 Subject: [PATCH 275/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index 70d2b89d..ab5564a8 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -10,7 +10,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". ``` -18 files changed, 2470 insertions(+), 1562 deletions(-) +18 files changed, 2729 insertions(+), 1696 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` @@ -45,6 +45,9 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) Новый функционал работает в согласованности с автоматическим управлением read-ahead и кэшем статуса присутствия страниц в ОЗУ, посредством [mincore()](https://man7.org/linux/man-pages/man2/mincore.2.html). + - Добавлена опция `MDBX_opt_prefault_write_enable` для возможности принудительного + включения/выключения prefault-записи. + - Реализован динамический выбор между сквозной записью на диск и обычной записью с последующим [fdatasync()](https://man7.org/linux/man-pages/man3/fdatasync.3p.html) управляемый опцией `MDBX_opt_writethrough_threshold`. @@ -102,6 +105,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) - Доработка подготовки резерва перед обновлением GC при включенном BigFoot. - Оптимизация `pnl_merge()` для случаев неперекрывающихся объединяемых списков. 
- Оптимизация поддержки отсортированного списка страниц в `dpl_append()`. + - Ускорение работы `mdbx_chk` при обработке пользовательских записей в `@MAIN`. ------------------------------------------------------------------------------- From 23fedf6bbad95d2876350e85957b3cb7c0b3cd56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 13 Dec 2022 19:44:36 +0300 Subject: [PATCH 276/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D0=BD=D1=82=D1=80?= =?UTF-8?q?=D0=BE=D0=BB=D1=8C=20=D0=B7=D0=BD=D0=B0=D1=87=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D0=B9=20=D0=BC=D0=B0=D0=BA=D1=80=D0=BE=D1=81=D0=BE=D0=B2-?= =?UTF-8?q?=D0=BE=D0=BF=D1=86=D0=B8=D0=B9=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA?= =?UTF-8?q?=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/options.h b/src/options.h index 92b0f56e..96dc8325 100644 --- a/src/options.h +++ b/src/options.h @@ -40,6 +40,8 @@ #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -49,6 +51,8 @@ #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -62,6 +66,8 @@ #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC 
must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -300,6 +306,8 @@ #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -313,6 +321,8 @@ #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -322,6 +332,8 @@ #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -333,6 +345,8 @@ #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -344,6 +358,9 @@ #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -352,6 +369,9 @@ #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -364,6 +384,9 @@ /* LY: assume no relevant mmap/dcache issues. */ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ #ifndef MDBX_64BIT_ATOMIC @@ -373,6 +396,8 @@ #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -398,6 +423,8 @@ #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif From 07f2ccb752f3db25d2a454184230f3f68f12b3d4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 13 Dec 2022 20:04:37 +0300 Subject: [PATCH 277/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BE=D0=BF=D1=86=D0=B8=D0=B8?= =?UTF-8?q?=20`MDBX=5FMMAP=5FUSE=5FMS=5FASYNC`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Суть в избавлении от лишнего вызова msync(MS_ASYNC) в режимах MDBX_WRITEMAP+MDBX_SAFE_NOSYNC и т.п. Гипотетически могут быть системы/платформы, на которых изменения в разделяемой памяти не видны другим процессам до вызова msync(MS_ASYNC) и/или до этого вызова не будет инициироваться вытеснение/запись таких страниц на диск. Поэтому использование msync(MS_ASYNC) вынесено под опцию MDBX_MMAP_USE_MS_ASYNC, которая по-умолчанию включена только на системах с MDBX_MMAP_INCOHERENT_FILE_WRITE или MDBX_MMAP_INCOHERENT_CPU_CACHE. --- src/options.h | 10 ++++++++++ src/osal.c | 12 ++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/options.h b/src/options.h index 96dc8325..a4081e6c 100644 --- a/src/options.h +++ b/src/options.h @@ -389,6 +389,16 @@ #error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 diff --git a/src/osal.c b/src/osal.c index a18e3e96..a8dbac6a 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1707,10 +1707,16 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { MDBX_INTERNAL_FUNC int 
osal_msync(const osal_mmap_t *map, size_t offset, size_t length, enum osal_syncmode_bits mode_bits) { + if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_KICK) + return MDBX_SUCCESS; + void *ptr = ptr_disp(map->base, offset); #if defined(_WIN32) || defined(_WIN64) if (!FlushViewOfFile(ptr, length)) return (int)GetLastError(); + if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && + !FlushFileBuffers(map->fd)) + return (int)GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly @@ -1718,6 +1724,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, // // However, this behavior may be changed in custom kernels, // so just leave such optimization to the libc discretion. + // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. // // assert(linux_kernel_version > 0x02061300); // if (mode_bits == MDBX_SYNC_KICK) @@ -1725,9 +1732,10 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, #endif /* Linux */ if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? 
MS_SYNC : MS_ASYNC)) return errno; - mode_bits &= ~MDBX_SYNC_DATA; + if ((mode_bits & MDBX_SYNC_SIZE) && fsync(map->fd)) + return errno; #endif - return osal_fsync(map->fd, mode_bits); + return MDBX_SUCCESS; } MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, From ffdff3f831c2f53ccdacea2b38fa7c23fe85e745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 14 Dec 2022 10:58:31 +0300 Subject: [PATCH 278/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index ab5564a8..2c31d3f9 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -5,12 +5,12 @@ English version [by Google](https://gitflic-ru.translate.goog/project/erthink/li and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.3 (Акула) запланирован на 2022-12-11 +## v0.12.3 (Акула) запланирован на 2022-12-20 Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". ``` -18 files changed, 2729 insertions(+), 1696 deletions(-) +18 files changed, 2792 insertions(+), 1698 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` @@ -70,6 +70,22 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) с отложенной/ленивой записью, так как в этом случае невозможно обеспечить сброс данных на диск во всех случаях на всех поддерживаемых платформах. + - Добавлена опция сборки `MDBX_MMAP_USE_MS_ASYNC` позволяющая отключить + использование системного вызова `msync(MS_ASYNC)`, в использовании + которого нет необходимости на подавляющем большинстве актуальных ОС. 
+ По-умолчанию `MDBX_MMAP_USE_MS_ASYNC=0` (выключено) на Linux и других + системах с unified page cache. Такое поведение (без использования + `msync(MS_ASYNC)`) соответствует неизменяемой (hardcoded) логике LMDB. В + результате, в простых/наивных бенчмарках, libmdbx опережает LMDB + примерна также как при реальном применении. + + На всякий случай стоит еще раз отметить/напомнить, что на Windows + предположительно libmdbx будет отставать от LMDB в сценариях с + множеством мелких транзакций, так как libmdbx осознанно использует на + Windows файловые блокировки, которые медленные (плохо реализованы в ядре + ОС), но позволяют застраховать пользователей от массы неверных действий + приводящих к повреждению БД. + Исправления (без корректировок новых функций): - Изменение размера отображения если это требуется для сброса данных на From bf2f3bfbbf16980322564cabf2d79518dda0e0d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 19 Dec 2022 21:22:42 +0300 Subject: [PATCH 279/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=87=D1=82=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F=20=D0=BE=D1=81=D0=B2=D0=BE=D0=B1=D0=BE=D0=B6=D0=B4=D0=B5?= =?UTF-8?q?=D0=BD=D0=BD=D0=BE=D0=B9=20=D0=BF=D0=B0=D0=BC=D1=8F=D1=82=D0=B8?= =?UTF-8?q?=20=D0=B8=20=D0=B6=D0=B0=D0=BB=D0=BE=D0=B1=20ASAN=20=D0=BF?= =?UTF-8?q?=D1=80=D0=B8=20=D1=81=D0=BF=D0=B8=D0=BB=D0=BB=D0=B8=D0=BD=D0=B3?= =?UTF-8?q?=D0=B5.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Вероятность проявления проблемы крайне низкая, но стало воспроизводиться после доработки спиллинга.
--- src/core.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/core.c b/src/core.c index 27c226b0..995ea2d5 100644 --- a/src/core.c +++ b/src/core.c @@ -4696,10 +4696,14 @@ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); size_t keep = 0; - while (mc->mc_flags & C_INITIALIZED) { - for (size_t i = 0; i < mc->mc_snum; ++i) { - const MDBX_page *mp = mc->mc_pg[i]; - if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { + while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { + tASSERT(txn, mc->mc_top == mc->mc_snum - 1); + const MDBX_page *mp; + size_t i = 0; + do { + mp = mc->mc_pg[i]; + tASSERT(txn, !IS_SUBP(mp)); + if (IS_MODIFIABLE(txn, mp)) { size_t const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && dpl_age(txn, n)) { @@ -4709,8 +4713,13 @@ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { ++keep; } } - } - if (!mc->mc_xcursor) + } while (++i < mc->mc_snum); + + tASSERT(txn, IS_LEAF(mp)); + if (!mc->mc_xcursor || mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) + break; + const MDBX_node *const node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_SUBDATA)) break; mc = &mc->mc_xcursor->mx_cursor; } @@ -15667,7 +15676,8 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { } cASSERT(mc, root >= NUM_METAS); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED) || + mc->mc_pg[0]->mp_pgno != root) { txnid_t pp_txnid = mc->mc_db->md_mod_txnid; pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid ? 
pp_txnid From b247b081af006b69b03e16063ad694e915cdac37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 14 Dec 2022 23:43:32 +0300 Subject: [PATCH 280/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D1=80?= =?UTF-8?q?=D0=B0=D0=B1=D0=BE=D1=82=D0=BA=D0=B0=20LRU-=D0=BE=D1=82=D0=BC?= =?UTF-8?q?=D0=B5=D1=82=D0=BE=D0=BA=20=D0=B4=D0=BB=D1=8F=20=D1=81=D0=BF?= =?UTF-8?q?=D0=B8=D0=BB=D0=BB=D0=B8=D0=BD=D0=B3=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Два существенных изменения: 1. Инкремент и обновление LRU происходит при изменении страницы, но не при доступе к ней. 2. Устранен регресс, из-за которого страницы в стеке курсора хоть помечались, но могли быть ошибочно пролиты на диск, так как dpl_age() возвращал не 0. --- src/core.c | 62 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/src/core.c b/src/core.c index 995ea2d5..b91bf3ca 100644 --- a/src/core.c +++ b/src/core.c @@ -3062,11 +3062,12 @@ static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); const MDBX_dpl *dl = txn->tw.dirtylist; assert((intptr_t)i > 0 && i <= dl->length); - return (txn->tw.dirtylru + 1 - dl->items[i].mlru) >> 1; + return (txn->tw.dirtylru >> 1) - (dl->items[i].mlru >> 1); } -static __inline uint32_t txn_lru_inc(MDBX_txn *txn) { - if (unlikely(++txn->tw.dirtylru > UINT32_MAX / 3)) +static __inline uint32_t txn_lru_turn(MDBX_txn *txn) { + txn->tw.dirtylru += 2; + if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3)) txn_lru_reduce(txn); return txn->tw.dirtylru & MDBX_dp_lru_mask; } @@ -3077,7 +3078,7 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 
|| MDBX_AVOID_MSYNC); - const MDBX_dp dp = {page, pgno, txn_lru_inc(txn) + (npages > 1)}; + const MDBX_dp dp = {page, pgno, txn_lru_turn(txn) + (npages > 1)}; MDBX_dpl *dl = txn->tw.dirtylist; tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); tASSERT(txn, dl->items[0].pgno == 0 && @@ -4692,7 +4693,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. */ -static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { +static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); size_t keep = 0; @@ -4706,10 +4707,11 @@ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { if (IS_MODIFIABLE(txn, mp)) { size_t const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - dpl_age(txn, n)) { + /* не считаем дважды */ dpl_age(txn, n)) { txn->tw.dirtylist->items[n].mlru = (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + (txn->tw.dirtylru & MDBX_dp_lru_mask); + tASSERT(txn, dpl_age(txn, n) == 0); ++keep; } } @@ -4727,8 +4729,8 @@ static size_t cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { } static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + txn_lru_turn(txn); size_t keep = m0 ? 
cursor_keep(txn, m0) : 0; for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && @@ -4831,6 +4833,7 @@ static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, const size_t need) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, !m0 || cursor_is_tracked(m0)); intptr_t wanna_spill_entries = need - txn->tw.dirtyroom - txn->tw.loose_count; intptr_t wanna_spill_npages = @@ -4863,7 +4866,7 @@ static size_t spill_gate(const MDBX_env *env, intptr_t part, : 0); part = (part < spill_max) ? part : spill_max; part = (part > spill_min) ? part : spill_min; - eASSERT(env, part > 0 && (size_t)part <= total); + eASSERT(env, part >= 0 && (size_t)part <= total); return (size_t)part; } @@ -4951,7 +4954,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) return MDBX_SUCCESS; #endif /* xMDBX_DEBUG_SPILLING */ - ERROR("all %zu dirty pages are unspillable since referenced " + ERROR("all %zu dirty pages are unspillable since referenced " "by a cursor(s), use fewer cursors or increase " "MDBX_opt_txn_dp_limit", unspillable); @@ -7661,6 +7664,8 @@ done: __hot static pgr_t page_alloc(const MDBX_cursor *const mc) { MDBX_txn *const txn = mc->mc_txn; + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, F_ISSET(txn->mt_dbistate[mc->mc_dbi], DBI_DIRTY | DBI_VALID)); /* If there are any loose pages, just use them */ while (likely(txn->tw.loose_pages)) { @@ -7799,6 +7804,9 @@ __hot static int page_touch(MDBX_cursor *mc) { MDBX_txn *txn = mc->mc_txn; int rc; + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, F_ISSET(*mc->mc_dbistate, DBI_DIRTY | DBI_VALID)); + tASSERT(txn, !IS_OVERFLOW(mp)); if (ASSERT_ENABLED()) { if (mc->mc_flags & C_SUB) { MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); @@ 
-7806,16 +7814,30 @@ __hot static int page_touch(MDBX_cursor *mc) { tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); - } else { - tASSERT(txn, *mc->mc_dbistate & DBI_DIRTY); } - tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, !IS_OVERFLOW(mp)); tASSERT(txn, dirtylist_check(txn)); } - if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp)) + if (IS_MODIFIABLE(txn, mp)) { + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC); + return MDBX_SUCCESS; + } + if (IS_SUBP(mp)) + return MDBX_SUCCESS; + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const size_t n = dpl_search(txn, mp->mp_pgno); + txn->tw.dirtylist->items[n].mlru = + (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + + txn_lru_turn(txn); return MDBX_SUCCESS; + } + if (IS_SUBP(mp)) { + np = (MDBX_page *)mp; + np->mp_txnid = txn->mt_front; + return MDBX_SUCCESS; + } + tASSERT(txn, !IS_OVERFLOW(mp)); if (IS_FROZEN(txn, mp)) { /* CoW the page */ @@ -8847,7 +8869,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; - txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0; + txn->tw.dirtylru = MDBX_DEBUG ? UINT32_MAX / 3 - 42 : 0; } else { tASSERT(txn, txn->tw.dirtylist == nullptr); txn->tw.dirtylist = nullptr; @@ -12422,7 +12444,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); skip_incore_sync: eASSERT(env, env->me_lck->mti_unsynced_pages.weak > 0); - eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0); + /* Может быть нулевым если unsynced_pages > 0 в результате спиллинга. 
+ * eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0); */ unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); } @@ -15405,9 +15428,6 @@ __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, const size_t i = dpl_search(spiller, pgno); tASSERT(txn, (intptr_t)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { - const uint32_t is_multi = - spiller->tw.dirtylist->items[i].mlru & MDBX_dp_multi_mask; - spiller->tw.dirtylist->items[i].mlru = is_multi + txn_lru_inc(txn); r.page = spiller->tw.dirtylist->items[i].ptr; break; } @@ -17729,13 +17749,13 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { - MDBX_cursor *m2; /* shrink fake page */ node_shrink(mp, mc->mc_ki[mc->mc_top]); node = page_node(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) From 701174326243a2b380048b176fd6aed849ff3c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 13 Dec 2022 17:08:39 +0300 Subject: [PATCH 281/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5?= =?UTF-8?q?=D1=80=D0=B6=D0=BA=D0=B0=20=D0=BD=D0=B5-=D0=BF=D0=B5=D1=87?= =?UTF-8?q?=D0=B0=D1=82=D0=BD=D1=8B=D1=85=20=D0=B8=D0=BC=D0=B5=D0=BD=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20subDb.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 23 +++++--- src/core.c | 161 +++++++++++++++++++++++++++++++++-------------------- 2 files changed, 115 insertions(+), 69 
deletions(-) diff --git a/mdbx.h b/mdbx.h index 1b3dcfd8..8d8621de 100644 --- a/mdbx.h +++ b/mdbx.h @@ -4163,6 +4163,8 @@ typedef int(MDBX_cmp_func)(const MDBX_val *a, * by current thread. */ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi); +LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, + MDBX_db_flags_t flags, MDBX_dbi *dbi); /** \deprecated Please * \ref avoid_custom_comparators "avoid using custom comparators" and use @@ -4182,6 +4184,9 @@ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_DEPRECATED LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +MDBX_DEPRECATED LIBMDBX_API int +mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); /** \defgroup value2key Value-to-Key functions * \brief Value-to-Key functions to @@ -5479,18 +5484,20 @@ typedef enum MDBX_page_type_t MDBX_page_type_t; #endif /** \brief Pseudo-name for MainDB */ -#define MDBX_PGWALK_MAIN ((const char *)((ptrdiff_t)0)) +#define MDBX_PGWALK_MAIN ((void *)((ptrdiff_t)0)) /** \brief Pseudo-name for GarbageCollectorDB */ -#define MDBX_PGWALK_GC ((const char *)((ptrdiff_t)-1)) +#define MDBX_PGWALK_GC ((void *)((ptrdiff_t)-1)) /** \brief Pseudo-name for MetaPages */ -#define MDBX_PGWALK_META ((const char *)((ptrdiff_t)-2)) +#define MDBX_PGWALK_META ((void *)((ptrdiff_t)-2)) /** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ -typedef int MDBX_pgvisitor_func( - const uint64_t pgno, const unsigned number, void *const ctx, const int deep, - const char *const dbi, const size_t page_size, const MDBX_page_type_t type, - const MDBX_error_t err, const size_t nentries, const size_t payload_bytes, - const size_t header_bytes, const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; +typedef int +MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_val *dbi_name, + const size_t page_size, const MDBX_page_type_t type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; /** \brief B-tree traversal function. */ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, diff --git a/src/core.c b/src/core.c index b91bf3ca..352c731a 100644 --- a/src/core.c +++ b/src/core.c @@ -9489,14 +9489,18 @@ static void dbi_update(MDBX_txn *txn, int keep) { if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; } else { - char *ptr = env->me_dbxs[i].md_name.iov_base; - if (ptr) { - env->me_dbxs[i].md_name.iov_len = 0; + const MDBX_val name = env->me_dbxs[i].md_name; + if (name.iov_base) { + env->me_dbxs[i].md_name.iov_base = nullptr; eASSERT(env, env->me_dbflags[i] == 0); atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), mo_AcquireRelease); - env->me_dbxs[i].md_name.iov_base = NULL; - osal_free(ptr); + env->me_dbxs[i].md_name.iov_len = 0; + if (name.iov_len) + osal_free(name.iov_base); + } else { + eASSERT(env, name.iov_len == 0); + eASSERT(env, env->me_dbflags[i] == 0); } } } @@ -9862,7 +9866,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { if ((txn->mt_dbistate[k] & DBI_VALID) && - /* txn->mt_dbxs[k].md_name.iov_len > 0 && */ + /* txn->mt_dbxs[k].md_name.iov_base && */ node_ks(node) == 
txn->mt_dbxs[k].md_name.iov_len && memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, node_ks(node)) == 0) { @@ -14986,7 +14990,8 @@ __cold static int env_close(MDBX_env *env) { if (env->me_dbxs) { for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) - osal_free(env->me_dbxs[i].md_name.iov_base); + if (env->me_dbxs[i].md_name.iov_len) + osal_free(env->me_dbxs[i].md_name.iov_base); osal_free(env->me_dbxs); env->me_numdbs = CORE_DBS; env->me_dbxs = nullptr; @@ -22160,8 +22165,8 @@ static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, return MDBX_SUCCESS; } -static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, - MDBX_dbi *dbi, MDBX_cmp_func *keycmp, +static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, + unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { int rc = MDBX_EINVAL; if (unlikely(!dbi)) @@ -22198,17 +22203,30 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* main table? 
*/ - if (!table_name) { + if (table_name == MDBX_PGWALK_MAIN || + table_name->iov_base == MDBX_PGWALK_MAIN) { rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; *dbi = MAIN_DBI; return rc; } + if (table_name == MDBX_PGWALK_GC || table_name->iov_base == MDBX_PGWALK_GC) { + rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); + if (unlikely(rc != MDBX_SUCCESS)) + goto early_bailout; + *dbi = FREE_DBI; + return rc; + } + if (table_name == MDBX_PGWALK_META || + table_name->iov_base == MDBX_PGWALK_META) { + rc = MDBX_EINVAL; + goto early_bailout; + } - MDBX_env *env = txn->mt_env; - size_t len = strlen(table_name); - if (len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) + MDBX_val key = *table_name; + MDBX_env *const env = txn->mt_env; + if (key.iov_len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) return MDBX_EINVAL; if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { @@ -22221,13 +22239,14 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* Is the DB already open? 
*/ MDBX_dbi scan, slot; for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_len) { + if (!txn->mt_dbxs[scan].md_name.iov_base) { /* Remember this free slot */ slot = scan; continue; } - if (len == txn->mt_dbxs[scan].md_name.iov_len && - !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { + if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && + !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, + key.iov_len)) { rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto early_bailout; @@ -22250,9 +22269,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* Find the DB info */ - MDBX_val key, data; - key.iov_len = len; - key.iov_base = (void *)table_name; + MDBX_val data; MDBX_cursor_couple couple; rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) @@ -22281,16 +22298,21 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* Done here so we cannot fail after creating a new DB */ - char *const namedup = osal_strdup(table_name); - if (unlikely(!namedup)) { - rc = MDBX_ENOMEM; - goto early_bailout; - } + void *clone = nullptr; + if (key.iov_len) { + clone = osal_malloc(key.iov_len); + if (unlikely(!clone)) { + rc = MDBX_ENOMEM; + goto early_bailout; + } + key.iov_base = memcpy(clone, key.iov_base, key.iov_len); + } else + key.iov_base = ""; int err = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(err != MDBX_SUCCESS)) { rc = err; - osal_free(namedup); + osal_free(clone); goto early_bailout; } @@ -22299,13 +22321,14 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* Rescan after mutex acquisition & import handles */ for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_len) { + if (!txn->mt_dbxs[scan].md_name.iov_base) { /* Remember this free slot */ slot = scan; continue; } - if (len 
== txn->mt_dbxs[scan].md_name.iov_len && - !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { + if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && + !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, + key.iov_len)) { rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) goto later_bailout; @@ -22352,11 +22375,10 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, later_bailout: *dbi = 0; later_exit: - osal_free(namedup); + osal_free(clone); } else { txn->mt_dbistate[slot] = (uint8_t)dbiflags; - txn->mt_dbxs[slot].md_name.iov_base = namedup; - txn->mt_dbxs[slot].md_name.iov_len = len; + txn->mt_dbxs[slot].md_name = key; txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = dbi_seq(env, slot); if (!(dbiflags & DBI_CREAT)) @@ -22377,15 +22399,41 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, return rc; } -int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, - MDBX_db_flags_t table_flags, MDBX_dbi *dbi) { - return dbi_open(txn, table_name, table_flags, dbi, nullptr, nullptr); +static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_PGWALK_MAIN || name_cstr == MDBX_PGWALK_GC || + name_cstr == MDBX_PGWALK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; + } + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } -int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, - MDBX_db_flags_t table_flags, MDBX_dbi *dbi, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp); +int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr); +} + 
+int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open(txn, name, flags, dbi, nullptr, nullptr); +} + +int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + return dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp); +} + +int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, @@ -23064,7 +23112,7 @@ typedef struct mdbx_walk_ctx { } mdbx_walk_ctx_t; __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const char *name, int deep); + const MDBX_val *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { if (mp) @@ -23085,7 +23133,8 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { /* Depth-first tree traversal. 
*/ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - const char *name, int deep, txnid_t parent_txnid) { + const MDBX_val *name, int deep, + txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); @@ -23251,33 +23300,22 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } assert(type == MDBX_page_leaf); - MDBX_db db; switch (node_flags(node)) { default: continue; - case F_SUBDATA /* sub-db */: { - const size_t namelen = node_ks(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + case F_SUBDATA /* sub-db */: + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; - break; + } else { + MDBX_db db; + memcpy(&db, node_data(node), sizeof(db)); + const MDBX_val subdb_name = {node_key(node), node_ks(node)}; + assert(err == MDBX_SUCCESS); + err = walk_sdb(ctx, &db, &subdb_name, deep + 1); } - - char namebuf_onstask[64]; - char *const sub_name = (namelen < sizeof(namebuf_onstask)) - ? 
namebuf_onstask - : osal_malloc(namelen + 1); - if (unlikely(!sub_name)) - return MDBX_ENOMEM; - memcpy(sub_name, node_key(node), namelen); - sub_name[namelen] = 0; - memcpy(&db, node_data(node), sizeof(db)); - assert(err == MDBX_SUCCESS); - err = walk_sdb(ctx, &db, sub_name, deep + 1); - if (sub_name != namebuf_onstask) - osal_free(sub_name); - } break; + break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(node_ds(node) != sizeof(MDBX_db) || @@ -23285,6 +23323,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } else { + MDBX_db db; memcpy(&db, node_data(node), sizeof(db)); assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); @@ -23308,7 +23347,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const char *name, int deep) { + const MDBX_val *name, int deep) { if (unlikely(sdb->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ From 44493c6448d34572ddf0077664aeb06988257da6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 14 Dec 2022 14:42:16 +0300 Subject: [PATCH 282/364] =?UTF-8?q?mdbx-tools:=20=D0=BF=D0=BE=D0=B4=D0=B4?= =?UTF-8?q?=D0=B5=D1=80=D0=B6=D0=BA=D0=B0=20=D0=BD=D0=B5-=D0=BF=D0=B5?= =?UTF-8?q?=D1=87=D0=B0=D1=82=D0=BD=D1=8B=D1=85=20=D0=B8=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=20subDb=20=D0=B2=20`mdbx=5Fchk`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mdbx_chk.c | 245 +++++++++++++++++++++++++++++++----------------- src/mdbx_dump.c | 2 +- 2 files changed, 159 insertions(+), 88 deletions(-) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index d1cb718e..3a6c59eb 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -23,6 +23,8 @@ #define xMDBX_TOOLS /* Avoid 
using internal eASSERT() */ #include "internals.h" +#include + typedef struct flagbit { int bit; const char *name; @@ -71,7 +73,7 @@ static void signal_handler(int sig) { #define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE typedef struct { - const char *name; + MDBX_val name; struct { uint64_t branch, large_count, large_volume, leaf; uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed; @@ -102,7 +104,7 @@ uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages; unsigned verbose; bool ignore_wrong_order, quiet, dont_traversal; -const char *only_subdb; +MDBX_val only_subdb; int stuck_meta = -1; struct problem { @@ -125,6 +127,93 @@ static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) { } } +static MDBX_val printable_buf; +static void free_printable_buf(void) { osal_free(printable_buf.iov_base); } + +static const char *sdb_name(const MDBX_val *val) { + if (val == MDBX_PGWALK_MAIN) + return "@MAIN"; + if (val == MDBX_PGWALK_GC) + return "@GC"; + if (val == MDBX_PGWALK_META) + return "@META"; + + const unsigned char *const data = val->iov_base; + const size_t len = val->iov_len; + if (data == MDBX_PGWALK_MAIN) + return "@MAIN"; + if (data == MDBX_PGWALK_GC) + return "@GC"; + if (data == MDBX_PGWALK_META) + return "@META"; + + if (!len) + return ""; + if (!data) + return ""; + if (len > 65536) { + static char buf[64]; + snprintf(buf, sizeof(buf), "", len); + return buf; + } + + bool printable = true; + bool quoting = false; + size_t xchars = 0; + for (size_t i = 0; i < val->iov_len && printable; ++i) { + quoting |= data[i] != '_' && isalnum(data[i]) == 0; + printable = isprint(data[i]) != 0 || + (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); + } + + size_t need = len + 1; + if (quoting || !printable) + need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; + if (need > printable_buf.iov_len) { + void *ptr = osal_realloc(printable_buf.iov_base, need); + if (!ptr) + return ""; + if (!printable_buf.iov_base) + 
atexit(free_printable_buf); + printable_buf.iov_base = ptr; + printable_buf.iov_len = need; + } + + char *out = printable_buf.iov_base; + if (!quoting) { + memcpy(out, data, len); + out += len; + } else if (printable) { + *out++ = '\''; + for (size_t i = 0; i < len; ++i) { + if (data[i] < ' ') { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 4); + static const char hex[] = "0123456789abcdef"; + out[0] = '\\'; + out[1] = 'x'; + out[2] = hex[data[i] >> 4]; + out[3] = hex[data[i] & 15]; + out += 4; + } else if (strchr("\"'`\\", data[i])) { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 2); + out[0] = '\\'; + out[1] = data[i]; + out += 2; + } else { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 1); + *out++ = data[i]; + } + } + *out++ = '\''; + } + assert((char *)printable_buf.iov_base + printable_buf.iov_len > out); + *out = 0; + return printable_buf.iov_base; +} + static void va_log(MDBX_log_level_t level, const char *function, int line, const char *msg, va_list args) { static const char *const prefixes[] = { @@ -190,19 +279,17 @@ static int check_user_break(void) { } static void pagemap_cleanup(void) { - for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1; - i < ARRAY_LENGTH(walk.dbi); ++i) { - if (walk.dbi[i].name) { - osal_free((void *)walk.dbi[i].name); - walk.dbi[i].name = nullptr; - } - } - osal_free(walk.pagemap); walk.pagemap = nullptr; } -static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { +static bool eq(const MDBX_val a, const MDBX_val b) { + return a.iov_len == b.iov_len && + (a.iov_base == b.iov_base || a.iov_len == 0 || + !memcmp(a.iov_base, b.iov_base, a.iov_len)); +} + +static walk_dbi_t *pagemap_lookup_dbi(const MDBX_val *dbi_name, bool silent) { static walk_dbi_t *last; if (dbi_name == MDBX_PGWALK_MAIN) @@ -212,24 +299,24 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { if (dbi_name == MDBX_PGWALK_META) 
return &dbi_meta; - if (last && strcmp(last->name, dbi_name) == 0) + if (last && eq(last->name, *dbi_name)) return last; walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1; - for (; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) { - if (strcmp(dbi->name, dbi_name) == 0) + for (; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { + if (eq(dbi->name, *dbi_name)) return last = dbi; } if (verbose > 0 && !silent) { - print(" - found `%s` area\n", dbi_name); + print(" - found %s area\n", sdb_name(dbi_name)); fflush(nullptr); } if (dbi == ARRAY_END(walk.dbi)) return nullptr; - dbi->name = osal_strdup(dbi_name); + dbi->name = *dbi_name; return last = dbi; } @@ -304,13 +391,13 @@ static size_t problems_pop(struct problem *list) { } static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, - void *const ctx, const int deep, - const char *const dbi_name_or_tag, const size_t page_size, - const MDBX_page_type_t pagetype, const MDBX_error_t err, - const size_t nentries, const size_t payload_bytes, - const size_t header_bytes, const size_t unused_bytes) { + void *const ctx, const int deep, const MDBX_val *dbi_name, + const size_t page_size, const MDBX_page_type_t pagetype, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) { (void)ctx; - const bool is_gc_tree = dbi_name_or_tag == MDBX_PGWALK_GC; + const bool is_gc_tree = dbi_name == MDBX_PGWALK_GC; if (deep > 42) { problem_add("deep", deep, "too large", nullptr); data_tree_problems += !is_gc_tree; @@ -318,7 +405,7 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, return MDBX_CORRUPTED /* avoid infinite loop/recursion */; } - walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name_or_tag, false); + walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name, false); if (!dbi) { data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; @@ -383,14 +470,14 @@ static int pgvisitor(const uint64_t pgno, 
const unsigned pgnumber, } if (pgnumber) { - if (verbose > 3 && (!only_subdb || strcmp(only_subdb, dbi->name) == 0)) { + if (verbose > 3 && (!only_subdb.iov_base || eq(only_subdb, dbi->name))) { if (pgnumber == 1) print(" %s-page %" PRIu64, pagetype_caption, pgno); else print(" %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber); print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR ", unused %" PRIiPTR ", deep %i\n", - dbi->name, header_bytes, + sdb_name(&dbi->name), header_bytes, (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries, payload_bytes, unused_bytes, deep); } @@ -408,8 +495,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1]; problem_add("page", spanpgno, (branch && coll_dbi == dbi) ? "loop" : "already used", - "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name, - deep); + "%s-page: by %s, deep %i", pagetype_caption, + sdb_name(&coll_dbi->name), deep); already_used = true; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; @@ -491,7 +578,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data); -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler); +static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, + visitor *handler); static int handle_userdb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { @@ -569,7 +657,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, walk.pagemap[pgno] = -1; else if (idx > 0) problem_add("page", pgno, "already used", "by %s", - walk.dbi[idx - 1].name); + sdb_name(&walk.dbi[idx - 1].name)); else problem_add("page", pgno, "already listed in GC", nullptr); } @@ -580,7 +668,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, : pgno_sub(pgno, span))) ++span; } - 
if (verbose > 3 && !only_subdb) { + if (verbose > 3 && !only_subdb.iov_base) { print(" transaction %" PRIaTXN ", %" PRIuPTR " pages, maxspan %" PRIaPGNO "%s\n", txnid, number, span, bad); @@ -607,39 +695,18 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, } static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { - return (a->iov_len == b->iov_len && - memcmp(a->iov_base, b->iov_base, a->iov_len) == 0) - ? 0 - : 1; + return eq(*a, *b) ? 0 : 1; } static int handle_maindb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { - char *name; - int rc; - size_t i; - - if (data->iov_len != sizeof(MDBX_db)) - return handle_userdb(record_number, key, data); - name = key->iov_base; - for (i = 0; i < key->iov_len; ++i) { - if (name[i] < ' ') - return handle_userdb(record_number, key, data); + if (data->iov_len == sizeof(MDBX_db)) { + int rc = process_db(~0u, key, handle_userdb); + if (rc != MDBX_INCOMPATIBLE) { + userdb_count++; + return rc; + } } - - name = osal_malloc(key->iov_len + 1); - if (unlikely(!name)) - return MDBX_ENOMEM; - memcpy(name, key->iov_base, key->iov_len); - name[key->iov_len] = '\0'; - - rc = process_db(~0u, name, handle_userdb); - osal_free(name); - if (rc != MDBX_INCOMPATIBLE) { - userdb_count++; - return rc; - } - return handle_userdb(record_number, key, data); } @@ -693,7 +760,8 @@ static const char *db_flags2valuemode(unsigned flags) { } } -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler) { +static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, + visitor *handler) { MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; @@ -708,13 +776,13 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler) { uint64_t key_bytes = 0, data_bytes = 0; if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) { - print(" ! abort processing `%s` due to a previous error\n", - dbi_name ? dbi_name : "@MAIN"); + print(" ! 
abort processing %s due to a previous error\n", + sdb_name(dbi_name)); return MDBX_BAD_TXN; } if (dbi_handle == ~0u) { - rc = mdbx_dbi_open_ex( + rc = mdbx_dbi_open_ex2( txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle, (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr, (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr); @@ -722,17 +790,17 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler) { if (!dbi_name || rc != MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { - error("mdbx_dbi_open(`%s`) failed, error %d %s\n", - dbi_name ? dbi_name : "@MAIN", rc, mdbx_strerror(rc)); + error("mdbx_dbi_open(%s) failed, error %d %s\n", sdb_name(dbi_name), rc, + mdbx_strerror(rc)); } return rc; } } - if (dbi_handle >= CORE_DBS && dbi_name && only_subdb && - strcmp(only_subdb, dbi_name) != 0) { + if (dbi_handle >= CORE_DBS && dbi_name && only_subdb.iov_base && + !eq(only_subdb, *dbi_name)) { if (verbose) { - print("Skip processing %s...\n", dbi_name); + print("Skip processing %s...\n", sdb_name(dbi_name)); fflush(nullptr); } skipped_subdb++; @@ -740,7 +808,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler) { } if (!second_pass && verbose) - print("Processing %s...\n", dbi_name ? 
dbi_name : "@MAIN"); + print("Processing %s...\n", sdb_name(dbi_name)); fflush(nullptr); rc = mdbx_dbi_flags(txn, dbi_handle, &flags); @@ -1110,9 +1178,9 @@ int main(int argc, char *argv[]) { } #endif - dbi_meta.name = "@META"; - dbi_free.name = "@GC"; - dbi_main.name = "@MAIN"; + dbi_meta.name.iov_base = MDBX_PGWALK_META; + dbi_free.name.iov_base = MDBX_PGWALK_GC; + dbi_main.name.iov_base = MDBX_PGWALK_MAIN; atexit(pagemap_cleanup); if (argc < 2) @@ -1190,9 +1258,10 @@ int main(int argc, char *argv[]) { dont_traversal = true; break; case 's': - if (only_subdb && strcmp(only_subdb, optarg)) + if (only_subdb.iov_base && strcmp(only_subdb.iov_base, optarg)) usage(prog); - only_subdb = optarg; + only_subdb.iov_base = optarg; + only_subdb.iov_len = strlen(optarg); break; case 'i': ignore_wrong_order = true; @@ -1230,7 +1299,7 @@ int main(int argc, char *argv[]) { error("write-mode must be enabled to turn to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } - if (only_subdb || dont_traversal) { + if (only_subdb.iov_base || dont_traversal) { error( "whole database checking with b-tree traversal are required to turn " "to the specified meta-page.\n"); @@ -1572,8 +1641,8 @@ int main(int argc, char *argv[]) { unused_pages += 1; empty_pages = lost_bytes = 0; - for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) { + for (walk_dbi_t *dbi = &dbi_main; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { empty_pages += dbi->pages.empty; lost_bytes += dbi->lost_bytes; } @@ -1583,9 +1652,10 @@ int main(int argc, char *argv[]) { print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n", walk.pgcount, unused_pages); if (verbose > 1) { - for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) { - print(" %s: subtotal %" PRIu64, dbi->name, dbi->pages.total); + for (walk_dbi_t *dbi = walk.dbi; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { + print(" %s: subtotal %" PRIu64, sdb_name(&dbi->name), + 
dbi->pages.total); if (dbi->pages.other && dbi->pages.other != dbi->pages.total) print(", other %" PRIu64, dbi->pages.other); if (dbi->pages.branch) @@ -1617,14 +1687,15 @@ int main(int argc, char *argv[]) { (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); if (verbose > 2) { - for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) + for (walk_dbi_t *dbi = walk.dbi; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) if (dbi->pages.total) { uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize; print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", - dbi->name, dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, - dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes, + sdb_name(&dbi->name), dbi_bytes, + dbi_bytes * 100.0 / total_page_bytes, dbi->payload_bytes, + dbi->payload_bytes * 100.0 / dbi_bytes, dbi_bytes - dbi->payload_bytes, (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes); if (dbi->pages.empty) @@ -1633,7 +1704,7 @@ int main(int argc, char *argv[]) { print(", %" PRIu64 " bytes lost", dbi->lost_bytes); print("\n"); } else - print(" %s: empty\n", dbi->name); + print(" %s: empty\n", sdb_name(&dbi->name)); } print(" - summary: average fill %.1f%%", walk.total_payload_bytes * 100.0 / total_page_bytes); @@ -1653,7 +1724,7 @@ int main(int argc, char *argv[]) { "b-tree", gc_tree_problems); problems_freedb = gc_tree_problems; } else - problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb); + problems_freedb = process_db(FREE_DBI, MDBX_PGWALK_GC, handle_freedb); if (verbose) { uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize; @@ -1719,7 +1790,7 @@ int main(int argc, char *argv[]) { } if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && - (envflags & MDBX_RDONLY) == 0 && !only_subdb && stuck_meta < 0 && + (envflags & MDBX_RDONLY) == 0 && !only_subdb.iov_base && stuck_meta < 0 && 
get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) { print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 "\n", @@ -1738,7 +1809,7 @@ int main(int argc, char *argv[]) { } } - if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb && + if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb.iov_base && (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) { const bool successful_check = (rc | total_problems | problems_meta) == 0; if (successful_check || force_turn_meta) { diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index f710d33d..8c266aed 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -66,7 +66,7 @@ static const char hexc[] = "0123456789abcdef"; static void dumpbyte(unsigned char c) { putchar(hexc[c >> 4]); - putchar(hexc[c & 0xf]); + putchar(hexc[c & 15]); } static void text(MDBX_val *v) { From 722c6ecf4366d11fb865135d0d45f12b65391011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 17 Dec 2022 17:17:26 +0300 Subject: [PATCH 283/364] mdbx: use `attribute(tls_model(local-dynamic))` as workaround for CLANG bug. 
--- src/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 352c731a..28d9f301 100644 --- a/src/core.c +++ b/src/core.c @@ -1164,7 +1164,12 @@ static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { #define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D) #define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0) -static __thread uint64_t rthc_thread_state; +static __thread uint64_t rthc_thread_state +#if __has_attribute(tls_model) && \ + (defined(__PIC__) || defined(__pic__) || MDBX_BUILD_SHARED_LIBRARY) + __attribute__((tls_model("local-dynamic"))) +#endif + ; #if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) && \ !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS) From fd98a635d93cff1bfbf23f4ae24aed4ce98e8aae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 17 Dec 2022 17:22:16 +0300 Subject: [PATCH 284/364] =?UTF-8?q?mdbx:=20=D0=BD=D0=B5=20=D0=B2=D0=BE?= =?UTF-8?q?=D0=B7=D0=B2=D1=80=D0=B0=D1=89=D0=B0=D0=B5=D0=BC=20=D0=BE=D1=88?= =?UTF-8?q?=D0=B8=D0=B1=D0=BA=D1=83=20=D0=BF=D1=80=D0=B8=20=D0=BF=D0=BE?= =?UTF-8?q?=D0=BF=D1=8B=D1=82=D0=BA=D0=B5=20=D0=B7=D0=B0=D0=BA=D1=80=D1=8B?= =?UTF-8?q?=D1=82=D0=B8=D1=8F=20`MAIN=5FDBI`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/core.c b/src/core.c index 28d9f301..94383bf7 100644 --- a/src/core.c +++ b/src/core.c @@ -22503,6 +22503,12 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(rc != MDBX_SUCCESS)) return rc; + if (unlikely(dbi < CORE_DBS)) + return (dbi == MAIN_DBI) ? 
MDBX_SUCCESS : MDBX_BAD_DBI; + + if (unlikely(dbi >= env->me_maxdbs)) + return MDBX_BAD_DBI; + if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_BAD_DBI; From e9a2042df16c812b9e7ee8ef624f21b488edbfe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 17 Dec 2022 00:31:00 +0300 Subject: [PATCH 285/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`MDBX=5FNOTHROW=5FPURE=5FFUNCT?= =?UTF-8?q?ION`=20=D0=BA=20=D0=BD=D0=B5=D0=BA=D0=BE=D1=82=D0=BE=D1=80?= =?UTF-8?q?=D1=8B=D0=BC=20=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8=D1=8F=D0=BC?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/core.c b/src/core.c index 94383bf7..52cfdad3 100644 --- a/src/core.c +++ b/src/core.c @@ -2998,7 +2998,8 @@ static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, return rc; } -static __always_inline size_t dpl_exist(const MDBX_txn *txn, pgno_t pgno) { +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t +dpl_exist(const MDBX_txn *txn, pgno_t pgno) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *dl = txn->tw.dirtylist; size_t i = dpl_search(txn, pgno); @@ -3062,7 +3063,8 @@ static __noinline void txn_lru_reduce(MDBX_txn *txn) { } while (txn); } -static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { +MDBX_NOTHROW_PURE_FUNCTION static __inline uint32_t dpl_age(const MDBX_txn *txn, + size_t i) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); const MDBX_dpl *dl = txn->tw.dirtylist; @@ -4750,8 +4752,8 @@ static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { * 0 = should be spilled; * ... * > 255 = must not be spilled. 
*/ -static unsigned spill_prio(const MDBX_txn *txn, const size_t i, - const uint32_t reciprocal) { +MDBX_NOTHROW_PURE_FUNCTION static unsigned +spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; const uint32_t age = dpl_age(txn, i); const size_t npages = dpl_npages(dl, i); From fe55f2566537d023b5755ea7f35c261ff6302f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 14 Dec 2022 11:43:22 +0300 Subject: [PATCH 286/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20`msync(MS=5F?= =?UTF-8?q?ASYNC)`=20=D0=B4=D0=BB=D1=8F=20=D1=81=D0=BF=D0=B8=D0=BB=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=B3=D0=B0=20=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8?= =?UTF-8?q?=D0=BC=D0=B5=20MDBX=5FWRITEMAP=20=D0=B2=D0=BD=D0=B5=20=D0=B7?= =?UTF-8?q?=D0=B0=D0=B2=D0=B8=D1=81=D0=B8=D0=BC=D0=BE=D1=81=D1=82=D0=B8=20?= =?UTF-8?q?=D0=BE=D1=82=20`MDBX=5FAVOID=5FMSYNC`=20=D0=B8=20`MDBX=5FMMAP?= =?UTF-8?q?=5FUSE=5FMS=5FASYNC`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 234 +++++++++++++++++++++++------------------------- src/internals.h | 1 + src/osal.c | 5 +- src/osal.h | 9 +- 4 files changed, 123 insertions(+), 126 deletions(-) diff --git a/src/core.c b/src/core.c index 52cfdad3..3bc11921 100644 --- a/src/core.c +++ b/src/core.c @@ -4179,7 +4179,6 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, static __inline void page_wash(MDBX_txn *txn, const size_t di, MDBX_page *const mp, const size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (di > 0) == (txn->tw.dirtylist != nullptr)); mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_BAD; @@ -4194,10 +4193,13 @@ static __inline void page_wash(MDBX_txn *txn, const size_t di, (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); } else { - tASSERT(txn, txn->tw.dirtylist == nullptr); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - tASSERT(txn, txn->tw.writemap_dirty_npages >= npages); - txn->tw.writemap_dirty_npages -= npages; + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); + if (txn->tw.dirtylist == nullptr) { + tASSERT(txn, !MDBX_AVOID_MSYNC); + txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) + ? npages + : txn->tw.writemap_dirty_npages; + } } VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); @@ -4686,14 +4688,13 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, const size_t npages) { - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP) || MDBX_AVOID_MSYNC); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); #if MDBX_ENABLE_PGOP_STAT txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ const pgno_t pgno = dp->mp_pgno; int err = iov_page(txn, ctx, dp, npages); - if (likely(err == MDBX_SUCCESS) && - (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP))) + if (likely(err == MDBX_SUCCESS)) err = pnl_append_range(true, &txn->tw.spilled.list, pgno << 1, npages); return err; } @@ -4702,7 +4703,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, * Returns the number of pages marked as unspillable. 
*/ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); size_t keep = 0; while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { tASSERT(txn, mc->mc_top == mc->mc_snum - 1); @@ -4736,7 +4737,8 @@ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { } static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); txn_lru_turn(txn); size_t keep = m0 ? cursor_keep(txn, m0) : 0; for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) @@ -4839,13 +4841,15 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, const size_t need) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, !m0 || cursor_is_tracked(m0)); - intptr_t wanna_spill_entries = need - txn->tw.dirtyroom - txn->tw.loose_count; - intptr_t wanna_spill_npages = - need + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count - - txn->mt_env->me_options.dp_limit; + const intptr_t wanna_spill_entries = + txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0; + const intptr_t wanna_spill_npages = + need + + (txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count - txn->mt_env->me_options.dp_limit; /* production mode */ if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) @@ -4882,15 +4886,19 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intptr_t wanna_spill_npages, const size_t need) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); int rc = MDBX_SUCCESS; - if (unlikely(txn->tw.dirtylist->length <= txn->tw.loose_count)) + if (unlikely(txn->tw.loose_count >= + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages))) goto done; - const size_t dirty_entries = txn->tw.dirtylist->length - txn->tw.loose_count; + const size_t dirty_entries = + txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1; const size_t dirty_npages = - txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; + (txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count; const size_t need_spill_entries = spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries); const size_t need_spill_npages = @@ -4902,17 +4910,18 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (!need_spill) goto done; -#if !MDBX_AVOID_MSYNC if (txn->mt_flags & MDBX_WRITEMAP) { NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", dirty_entries, dirty_npages); - tASSERT(txn, txn->tw.spilled.list == nullptr); const MDBX_env *env = txn->mt_env; + tASSERT(txn, txn->tw.spilled.list == nullptr); rc = osal_msync(&txn->mt_env->me_dxb_mmap, 0, pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; +#if MDBX_AVOID_MSYNC + tASSERT(txn, dirtylist_check(txn)); env->me_lck->mti_unsynced_pages.weak += txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; dpl_clear(txn->tw.dirtylist); @@ -4921,34 +4930,40 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, rc = dpl_append(txn, lp->mp_pgno, lp, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } + tASSERT(txn, dirtylist_check(txn)); +#else + tASSERT(txn, txn->tw.dirtylist == nullptr); + env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; + txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages; + txn->tw.writemap_dirty_npages = 0; +#endif /* MDBX_AVOID_MSYNC */ goto done; } -#endif /* MDBX_AVOID_MSYNC */ NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", need_spill_entries, need_spill_npages); tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= need_spill_npages); - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & 
MDBX_WRITEMAP)) { - if (!txn->tw.spilled.list) { - txn->tw.spilled.least_removed = INT_MAX; - txn->tw.spilled.list = pnl_alloc(need_spill); - if (unlikely(!txn->tw.spilled.list)) { - rc = MDBX_ENOMEM; - bailout: - txn->mt_flags |= MDBX_TXN_ERROR; - return rc; - } - } else { - /* purge deleted slots */ - spill_purge(txn); - rc = pnl_reserve(&txn->tw.spilled.list, need_spill); - (void)rc /* ignore since the resulting list may be shorter - and pnl_append() will increase pnl on demand */ - ; + if (!txn->tw.spilled.list) { + txn->tw.spilled.least_removed = INT_MAX; + txn->tw.spilled.list = pnl_alloc(need_spill); + if (unlikely(!txn->tw.spilled.list)) { + rc = MDBX_ENOMEM; + bailout: + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; } + } else { + /* purge deleted slots */ + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spilled.list, need_spill); + (void)rc /* ignore since the resulting list may be shorter + and pnl_append() will increase pnl on demand */ + ; } /* Сортируем чтобы запись на диск была полее последовательна */ @@ -5063,57 +5078,35 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - unsigned prev_prio = 256, prio; - size_t r, w; - for (w = 0, r = 1; - r <= dl->length && (spilled_entries < need_spill_entries || - spilled_npages < need_spill_npages); - prev_prio = prio, ++r) { - prio = spill_prio(txn, r, reciprocal); - MDBX_page *const dp = dl->items[r].ptr; - if (prio < prio2adjacent) { - const pgno_t pgno = dl->items[r].pgno; - const unsigned npages = dpl_npages(dl, r); - if (prio <= prio2spill) { - if (prev_prio < prio2adjacent && prev_prio > prio2spill && - dpl_endpgno(dl, r - 1) == pgno) { - DEBUG("co-spill %u prev-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1), - prev_prio); - --w; - const unsigned co_npages = dpl_npages(dl, r - 1); - rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, co_npages); - if 
(unlikely(rc != MDBX_SUCCESS)) - break; - ++spilled_entries; - spilled_npages += co_npages; - } + size_t r = 0, w = 0; + pgno_t last = 0; + while (r < dl->length && (spilled_entries < need_spill_entries || + spilled_npages < need_spill_npages)) { + dl->items[++w] = dl->items[++r]; + unsigned prio = spill_prio(txn, w, reciprocal); + if (prio > prio2spill && + (prio >= prio2adjacent || last != dl->items[w].pgno)) + continue; - DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, - dp->mp_pgno, dpl_age(txn, r), prio); - rc = spill_page(txn, &ctx, dp, npages); - if (unlikely(rc != MDBX_SUCCESS)) - break; - ++spilled_entries; - spilled_npages += npages; - continue; - } + const size_t e = w; + last = dpl_endpgno(dl, w); + while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno && + spill_prio(txn, w, reciprocal) < prio2adjacent) + ; - if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { - DEBUG("co-spill %u next-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - npages, dp->mp_pgno, dpl_age(txn, r), prio); - rc = spill_page(txn, &ctx, dp, npages); - if (unlikely(rc != MDBX_SUCCESS)) - break; - prio = prev_prio /* to continue co-spilling next adjacent pages */; - ++spilled_entries; - spilled_npages += npages; - continue; - } + for (size_t i = w; ++i <= e;) { + const unsigned npages = dpl_npages(dl, i); + prio = spill_prio(txn, i, reciprocal); + DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)", + (prio > prio2spill) ? 
"co-" : "", i, npages, dl->items[i].pgno, + dpl_age(txn, i), prio); + tASSERT(txn, prio < 256); + ++spilled_entries; + spilled_npages += npages; + rc = spill_page(txn, &ctx, dl->items[i].ptr, npages); + if (unlikely(rc != MDBX_SUCCESS)) + goto failed; } - dl->items[++w] = dl->items[r]; } VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, @@ -5121,9 +5114,10 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); tASSERT(txn, spilled_npages >= spilled_entries); - while (r <= dl->length) - dl->items[++w] = dl->items[r++]; - tASSERT(txn, r - 1 - w == spilled_entries); + failed: + while (r < dl->length) + dl->items[++w] = dl->items[++r]; + tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS); dl->sorted = dpl_setlen(dl, w); txn->tw.dirtyroom += spilled_entries; @@ -5138,10 +5132,8 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, goto bailout; txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages; - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); - txn->mt_flags |= MDBX_TXN_SPILLS; - } + pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); + txn->mt_flags |= MDBX_TXN_SPILLS; NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", spilled_entries, spilled_npages, txn->tw.dirtyroom); } else { @@ -5180,11 +5172,6 @@ static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, const MDBX_val *data) { MDBX_txn *txn = mc->mc_txn; tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - if (!txn->tw.dirtylist) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - return MDBX_SUCCESS; - } - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); /* Estimate how much space this operation will take: */ /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ 
@@ -5676,16 +5663,12 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, txn->tw.loose_pages = mp_next(lp); txn->tw.loose_count--; txn->tw.dirtyroom++; - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) dpage_free(txn->mt_env, lp, 1); - } } else { ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) dpage_free(txn->mt_env, mp, npages); - } return MDBX_TXN_FULL; } } @@ -6059,7 +6042,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), - MDBX_SYNC_KICK); + MDBX_SYNC_NONE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -7834,6 +7817,20 @@ __hot static int page_touch(MDBX_cursor *mc) { return MDBX_SUCCESS; tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); const size_t n = dpl_search(txn, mp->mp_pgno); + if (MDBX_AVOID_MSYNC && + unlikely(txn->tw.dirtylist->items[n].pgno != mp->mp_pgno)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1); + VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); + np = (MDBX_page *)mp; +#if MDBX_ENABLE_PGOP_STAT + txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + return page_dirty(txn, np, 1); + } + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); + tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && + txn->tw.dirtylist->items[n].ptr == mp); txn->tw.dirtylist->items[n].mlru = (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + txn_lru_turn(txn); @@ -8883,6 +8880,8 @@ 
static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->tw.dirtyroom = MAX_PAGENO; txn->tw.dirtylru = 0; } + eASSERT(env, txn->tw.writemap_dirty_npages == 0); + eASSERT(env, txn->tw.writemap_spilled_npages == 0); } /* Setup db info */ @@ -9352,7 +9351,8 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); info->txn_space_dirty = pgno2bytes( env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose - : txn->tw.writemap_dirty_npages); + : (txn->tw.writemap_dirty_npages + + txn->tw.writemap_spilled_npages)); info->txn_reader_lag = INT64_MAX; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && lck) { @@ -9566,10 +9566,8 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { remove_dl: npages = dpl_npages(dl, r); dl->pages_including_loose -= npages; - if (!MDBX_AVOID_MSYNC || !(txn->mt_env->me_flags & MDBX_WRITEMAP)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) dpage_free(txn->mt_env, dl->items[r].ptr, npages); - } ++r; next_i: i += step; @@ -12410,7 +12408,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, rc = MDBX_RESULT_FALSE /* carry steady */; if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - enum osal_syncmode_bits mode_bits = MDBX_SYNC_KICK; + enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; unsigned sync_op = 0; if ((flags & MDBX_SAFE_NOSYNC) == 0) { sync_op = 1; @@ -12422,7 +12420,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, mode_bits |= MDBX_SYNC_IODQ; } else if (unlikely(env->me_incore)) goto skip_incore_sync; - if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { + if (flags & MDBX_WRITEMAP) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.msync.weak += sync_op; #else @@ -12567,7 
+12565,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync( &env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_KICK + (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { #if MDBX_ENABLE_PGOP_STAT @@ -13995,14 +13993,10 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, #if MDBX_ENABLE_PGOP_STAT lck->mti_pgop_stat.wops.weak = 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_KICK); + err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, + MDBX_SYNC_DATA | MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { - ERROR("initial-%s for lck-file failed", "msync"); - goto bailout; - } - err = osal_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); - if (unlikely(err != MDBX_SUCCESS)) { - ERROR("initial-%s for lck-file failed", "fsync"); + ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); goto bailout; } } else { diff --git a/src/internals.h b/src/internals.h index 4f0ec2ac..06e8503a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1110,6 +1110,7 @@ struct MDBX_txn { MDBX_PNL list; } spilled; size_t writemap_dirty_npages; + size_t writemap_spilled_npages; }; } tw; }; diff --git a/src/osal.c b/src/osal.c index a8dbac6a..8a76d865 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1566,6 +1566,7 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ while (1) { switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { + case MDBX_SYNC_NONE: case MDBX_SYNC_KICK: return MDBX_SUCCESS /* nothing to do */; #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 @@ -1707,7 +1708,7 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, size_t length, enum osal_syncmode_bits 
mode_bits) { - if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_KICK) + if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_NONE) return MDBX_SUCCESS; void *ptr = ptr_disp(map->base, offset); @@ -1727,7 +1728,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. // // assert(linux_kernel_version > 0x02061300); - // if (mode_bits == MDBX_SYNC_KICK) + // if (mode_bits <= MDBX_SYNC_KICK) // return MDBX_SUCCESS; #endif /* Linux */ if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) diff --git a/src/osal.h b/src/osal.h index bd869403..53b80e58 100644 --- a/src/osal.h +++ b/src/osal.h @@ -523,10 +523,11 @@ osal_thread_create(osal_thread_t *thread, MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { - MDBX_SYNC_KICK = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_NONE = 0, + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, From 686145ec2e7426b32b3e26104bbe13dae4466750 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 16 Dec 2022 23:02:59 +0300 Subject: [PATCH 287/364] =?UTF-8?q?mdbx:=20=D1=80=D0=B5=D1=84=D0=B0=D0=BA?= =?UTF-8?q?=D1=82=D0=BE=D1=80=D0=B8=D0=BD=D0=B3=20=D1=81=20=D1=83=D0=B4?= =?UTF-8?q?=D0=B0=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20`cursor=5Fspill()`?= =?UTF-8?q?=20=D0=B8=20`MDBX=5FNOSPILL`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bits.md | 2 +- src/core.c | 165 +++++++++++++++++++++--------------------------- src/internals.h | 3 - 3 files changed, 73 insertions(+), 97 deletions(-) diff --git a/src/bits.md b/src/bits.md index 99f9f117..7566cdf3 100644 --- a/src/bits.md +++ b/src/bits.md @@ -15,7 +15,7 @@ 
N | MASK | ENV | TXN | DB | PUT | DBI | NOD 12|0000 1000| | | | | | | | | 13|0000 2000|VALIDATION | | | | | |P_SPILLED | | 14|0000 4000|NOSUBDIR | | | | | |P_LOOSE | | -15|0000 8000| | |DB_VALID |NOSPILL | | |P_FROZEN | | +15|0000 8000| | |DB_VALID | | | |P_FROZEN | | 16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE | | | 17|0002 0000|RDONLY |TXN_RDONLY | |APPEND | |APPEND | | <= | 18|0004 0000|NOMETASYNC |TXN_NOMETASYNC|CREATE |APPENDDUP | | | | | diff --git a/src/core.c b/src/core.c index 3bc11921..4a1fae77 100644 --- a/src/core.c +++ b/src/core.c @@ -3210,8 +3210,8 @@ static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages); static int page_touch(MDBX_cursor *mc); -static int cursor_touch(MDBX_cursor *mc); -static int touch_dbi(MDBX_cursor *mc); +static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data); #define MDBX_END_NAMES \ { \ @@ -5141,7 +5141,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, for (size_t i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; VERBOSE( - "dirtylist[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", + "unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), spill_prio(txn, i, reciprocal)); } @@ -5168,39 +5168,6 @@ done: : MDBX_TXN_FULL; } -static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, - const MDBX_val *data) { - MDBX_txn *txn = mc->mc_txn; - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - - /* Estimate how much space this operation will take: */ - /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ - size_t need = CURSOR_STACK + 3; - /* 2) GC/FreeDB for any payload */ - if (mc->mc_dbi > FREE_DBI) { - need += txn->mt_dbs[FREE_DBI].md_depth + 3; - /* 3) Named DBs 
also dirty the main DB */ - if (mc->mc_dbi > MAIN_DBI) - need += txn->mt_dbs[MAIN_DBI].md_depth + 3; - } -#if xMDBX_DEBUG_SPILLING != 2 - /* production mode */ - /* 4) Double the page chain estimation - * for extensively splitting, rebalance and merging */ - need += need; - /* 5) Factor the key+data which to be put in */ - need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1; -#else - /* debug mode */ - (void)key; - (void)data; - mc->mc_txn->mt_env->debug_dirtied_est = ++need; - mc->mc_txn->mt_env->debug_dirtied_act = 0; -#endif /* xMDBX_DEBUG_SPILLING == 2 */ - - return txn_spill(txn, mc, need); -} - /*----------------------------------------------------------------------------*/ static bool meta_bootid_match(const MDBX_meta *meta) { @@ -7833,7 +7800,7 @@ __hot static int page_touch(MDBX_cursor *mc) { txn->tw.dirtylist->items[n].ptr == mp); txn->tw.dirtylist->items[n].mlru = (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + - txn_lru_turn(txn); + (txn->tw.dirtylru & MDBX_dp_lru_mask); return MDBX_SUCCESS; } if (IS_SUBP(mp)) { @@ -9991,8 +9958,12 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { } static int gcu_touch(gcu_context_t *ctx) { + MDBX_val key, val; + key.iov_base = val.iov_base = nullptr; + key.iov_len = sizeof(txnid_t); + val.iov_len = MDBX_PNL_SIZEOF(ctx->cursor.mc_txn->tw.retired_pages); ctx->cursor.mc_flags |= C_GCU; - int err = cursor_touch(&ctx->cursor); + int err = cursor_touch(&ctx->cursor, &key, &val); ctx->cursor.mc_flags -= C_GCU; return err; } @@ -10036,18 +10007,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { for_all_before_touch, for_relist, for_split, for_cow, for_tree_before_touch); - int err; - if (unlikely(for_relist > 2)) { - MDBX_val key, val; - key.iov_base = val.iov_base = nullptr; - key.iov_len = sizeof(txnid_t); - val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - err = cursor_spill(&ctx->cursor, &key, &val); - if (unlikely(err != MDBX_SUCCESS)) - return err; - 
} - - err = gcu_touch(ctx); + int err = gcu_touch(ctx); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); if (!MDBX_ENABLE_BIGFOOT && unlikely(for_relist > 1) && @@ -15517,7 +15477,8 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, ready: if (flags & MDBX_PS_MODIFY) { - if (unlikely((rc = page_touch(mc)) != 0)) + rc = page_touch(mc); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mp = mc->mc_pg[mc->mc_top]; } @@ -15731,8 +15692,6 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { mc->mc_pg[0]->mp_flags); if (flags & MDBX_PS_MODIFY) { - if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = touch_dbi(mc))) - return rc; if (unlikely(rc = page_touch(mc))) return rc; } @@ -16878,21 +16837,61 @@ static int touch_dbi(MDBX_cursor *mc) { return MDBX_SUCCESS; } -/* Touch all the pages in the cursor stack. Set mc_top. - * Makes sure all the pages are writable, before attempting a write operation. - * [in] mc The cursor to operate on. 
*/ -static int cursor_touch(MDBX_cursor *mc) { - int rc = MDBX_SUCCESS; +static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data) { + cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0); + cASSERT(mc, (mc->mc_flags & C_INITIALIZED) || mc->mc_snum == 0); + cASSERT(mc, cursor_is_tracked(mc)); + + txn_lru_turn(mc->mc_txn); + if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - rc = touch_dbi(mc); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + int err = touch_dbi(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; } + + if ((mc->mc_flags & C_SUB) == 0) { + MDBX_txn *const txn = mc->mc_txn; + /* Estimate how much space this operation will take: */ + /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ + size_t need = CURSOR_STACK + 3; + /* 2) GC/FreeDB for any payload */ + if (mc->mc_dbi > FREE_DBI) { + need += txn->mt_dbs[FREE_DBI].md_depth + 3; + /* 3) Named DBs also dirty the main DB */ + if (mc->mc_dbi > MAIN_DBI) + need += txn->mt_dbs[MAIN_DBI].md_depth + 3; + } +#if xMDBX_DEBUG_SPILLING != 2 + /* production mode */ + /* 4) Double the page chain estimation + * for extensively splitting, rebalance and merging */ + need += need; + /* 5) Factor the key+data which to be put in */ + need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1; +#else + /* debug mode */ + (void)key; + (void)data; + txn->mt_env->debug_dirtied_est = ++need; + txn->mt_env->debug_dirtied_act = 0; +#endif /* xMDBX_DEBUG_SPILLING == 2 */ + + int err = txn_spill(txn, mc, need); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + + int rc = MDBX_SUCCESS; if (likely(mc->mc_snum)) { mc->mc_top = 0; do { rc = page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); + if (unlikely(rc != MDBX_SUCCESS)) + break; + mc->mc_top += 1; + } while (mc->mc_top < mc->mc_snum); mc->mc_top = mc->mc_snum - 1; } return rc; @@ -16952,9 +16951,6 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, 
data->iov_base = nullptr; } - const unsigned nospill = flags & MDBX_NOSPILL; - flags -= nospill; - if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; @@ -17159,26 +17155,19 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } mc->mc_flags &= ~C_DEL; - /* Cursor is positioned, check for room in the dirty list */ - if (!nospill) { - rdata = data; - if (unlikely(flags & MDBX_MULTIPLE)) { - rdata = &xdata; - xdata.iov_len = data->iov_len * dcount; - } - if (unlikely(err = cursor_spill(mc, key, rdata))) - return err; + rdata = data; + if (unlikely(flags & MDBX_MULTIPLE)) { + rdata = &xdata; + xdata.iov_len = data->iov_len * dcount; } + err = cursor_touch(mc, key, rdata); + if (unlikely(err)) + return err; if (unlikely(rc == MDBX_NO_ROOT)) { /* new database, write a root leaf page */ DEBUG("%s", "allocating new root leaf page"); - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - err = touch_dbi(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } pgr_t npr = page_new(mc, P_LEAF); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; @@ -17205,11 +17194,6 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) npr.page->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; - } else { - /* make sure all cursor pages are writable */ - err = cursor_touch(mc); - if (unlikely(err)) - return err; } bool insert_key, insert_data, do_sub = false; @@ -17602,9 +17586,8 @@ new_sub:; STATIC_ASSERT( (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == MDBX_NOOVERWRITE); - xflags = MDBX_CURRENT | MDBX_NOSPILL | - ((flags & MDBX_NODUPDATA) >> - SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); + xflags = MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> + SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); if ((flags & MDBX_CURRENT) == 0) { xflags -= 
MDBX_CURRENT; err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); @@ -17718,11 +17701,7 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) return MDBX_NOTFOUND; - if (likely((flags & MDBX_NOSPILL) == 0) && - unlikely(rc = cursor_spill(mc, NULL, NULL))) - return rc; - - rc = cursor_touch(mc); + rc = cursor_touch(mc, nullptr, nullptr); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -17744,7 +17723,7 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } else { if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); + rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, 0); if (unlikely(rc)) return rc; /* If sub-DB still has entries, we're done */ diff --git a/src/internals.h b/src/internals.h index 06e8503a..eb14a15b 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1681,9 +1681,6 @@ typedef struct MDBX_node { #define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) #endif -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 - MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1)) From 9cbbdfa025b8619c55f1eceb520e118de326034f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 19 Dec 2022 08:08:14 +0300 Subject: [PATCH 288/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`const`=20=D0=BA=20=D0=B0?= =?UTF-8?q?=D1=80=D0=B3=D1=83=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D0=BC=20=D1=84?= =?UTF-8?q?=D1=83=D0=BD=D0=BA=D1=86=D0=B8=D0=B9=20=D0=BF=D0=BE=D0=BB=D1=83?= =?UTF-8?q?=D1=87=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=B8=20=D0=BA=D0=BE=D0=BD?= 
=?UTF-8?q?=D1=82=D1=80=D0=BE=D0=BB=D1=8F=20=D1=81=D1=82=D1=80=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=86.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/core.c b/src/core.c index 4a1fae77..fbbc16cd 100644 --- a/src/core.c +++ b/src/core.c @@ -3236,27 +3236,27 @@ enum { static int txn_end(MDBX_txn *txn, const unsigned mode); static __always_inline pgr_t page_get_inline(const uint16_t ILL, - MDBX_cursor *const mc, + const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front); -static pgr_t page_get_any(MDBX_cursor *const mc, const pgno_t pgno, +static pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) { return page_get_inline(P_ILL_BITS, mc, pgno, front); } -__hot static pgr_t page_get_three(MDBX_cursor *const mc, const pgno_t pgno, - const txnid_t front) { +__hot static pgr_t page_get_three(const MDBX_cursor *const mc, + const pgno_t pgno, const txnid_t front) { return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, front); } -static pgr_t page_get_large(MDBX_cursor *const mc, const pgno_t pgno, +static pgr_t page_get_large(const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) { return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno, front); } -static __always_inline int __must_check_result page_get(MDBX_cursor *mc, +static __always_inline int __must_check_result page_get(const MDBX_cursor *mc, const pgno_t pgno, MDBX_page **mp, const txnid_t front) { @@ -3330,9 +3330,9 @@ static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); static int __must_check_result audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc); -static int __must_check_result page_check(MDBX_cursor *const mc, +static int __must_check_result page_check(const MDBX_cursor *const mc, const MDBX_page *const mp); -static int 
__must_check_result cursor_check(MDBX_cursor *mc); +static int __must_check_result cursor_check(const MDBX_cursor *mc); static int __must_check_result cursor_check_updating(MDBX_cursor *mc); static int __must_check_result cursor_del(MDBX_cursor *mc); static int __must_check_result delete(MDBX_txn *txn, MDBX_dbi dbi, @@ -15348,10 +15348,9 @@ __hot static __always_inline int page_get_checker_lite(const uint16_t ILL, return MDBX_SUCCESS; } -__cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL, - MDBX_page *page, - MDBX_cursor *const mc, - const txnid_t front) { +__cold static __noinline pgr_t +page_get_checker_full(const uint16_t ILL, MDBX_page *page, + const MDBX_cursor *const mc, const txnid_t front) { pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)}; if (likely(r.err == MDBX_SUCCESS)) r.err = page_check(mc, page); @@ -15361,7 +15360,7 @@ __cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL, } __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, - MDBX_cursor *const mc, + const MDBX_cursor *const mc, const pgno_t pgno, const txnid_t front) { MDBX_txn *const txn = mc->mc_txn; @@ -19505,7 +19504,8 @@ retry: return MDBX_PROBLEM; } -__cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { +__cold static int page_check(const MDBX_cursor *const mc, + const MDBX_page *const mp) { DKBUF; int rc = MDBX_SUCCESS; if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) @@ -19882,7 +19882,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { return rc; } -__cold static int cursor_check(MDBX_cursor *mc) { +__cold static int cursor_check(const MDBX_cursor *mc) { if (!mc->mc_txn->tw.dirtylist) { cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); From 85828f677a46f5482d555e61bad4187b5224bd61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 17 Dec 2022 21:03:31 +0300 Subject: [PATCH 289/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D1=81?= =?UTF-8?q?=D0=BE=D0=B7=D0=B4=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=83=D1=81?= =?UTF-8?q?=D1=82=D0=BE=D0=B9=20MAIN=5FDBI=20=D0=BF=D1=80=D0=B8=20=D0=BD?= =?UTF-8?q?=D0=B5=D0=BE=D0=B1=D1=85=D0=BE=D0=B4=D0=B8=D0=BC=D0=BE=D1=81?= =?UTF-8?q?=D1=82=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 133 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/src/core.c b/src/core.c index fbbc16cd..69588301 100644 --- a/src/core.c +++ b/src/core.c @@ -8861,6 +8861,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { (db_flags & DB_VALID) ? DBI_VALID | DBI_USRVALID | DBI_STALE : 0; } txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; + rc = + setup_dbx(&txn->mt_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], env->me_psize); txn->mt_dbistate[FREE_DBI] = DBI_VALID; txn->mt_front = txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); @@ -14493,6 +14495,10 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, env_pathname.ent_len * sizeof(pathchar_t)); env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; + env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; + env->me_dbxs[FREE_DBI].md_vlen_min = 4; + env->me_dbxs[FREE_DBI].md_vlen_max = + mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: * @@ -22152,15 +22158,27 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, if (unlikely(!dbi)) return rc; + void *clone = nullptr; + bool locked = false; if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) { - early_bailout: + bailout: + tASSERT(txn, MDBX_IS_ERROR(rc)); *dbi = 0; + if (locked) + 
ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); + osal_free(clone); return rc; } rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; + + if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { + rc = MDBX_EACCESS; + goto bailout; + } switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_ACCEDE)) { @@ -22170,7 +22188,7 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, __fallthrough /* fall through */; default: rc = MDBX_EINVAL; - goto early_bailout; + goto bailout; case MDBX_DUPSORT: case MDBX_DUPSORT | MDBX_REVERSEDUP: @@ -22187,21 +22205,21 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, table_name->iov_base == MDBX_PGWALK_MAIN) { rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; *dbi = MAIN_DBI; return rc; } if (table_name == MDBX_PGWALK_GC || table_name->iov_base == MDBX_PGWALK_GC) { rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; *dbi = FREE_DBI; return rc; } if (table_name == MDBX_PGWALK_META || table_name->iov_base == MDBX_PGWALK_META) { rc = MDBX_EINVAL; - goto early_bailout; + goto bailout; } MDBX_val key = *table_name; @@ -22209,13 +22227,34 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, if (key.iov_len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) return MDBX_EINVAL; - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + /* Cannot mix named table(s) with DUPSORT flags */ + if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT)) { + if ((user_flags & MDBX_CREATE) == 0) { + rc = MDBX_NOTFOUND; + goto bailout; + } + if (txn->mt_dbs[MAIN_DBI].md_leaf_pages || txn->mt_dbxs[MAIN_DBI].md_cmp) { + /* В MAIN_DBI есть записи либо она уже использовалась. 
*/ + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + /* Пересоздаём MAIN_DBI если там пусто. */ + atomic_store32(&txn->mt_dbiseqs[MAIN_DBI], dbi_seq(env, MAIN_DBI), + mo_AcquireRelease); + tASSERT(txn, txn->mt_dbs[MAIN_DBI].md_depth == 0 && + txn->mt_dbs[MAIN_DBI].md_entries == 0 && + txn->mt_dbs[MAIN_DBI].md_root == P_INVALID); + txn->mt_dbs[MAIN_DBI].md_flags &= MDBX_REVERSEKEY | MDBX_INTEGERKEY; + txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbxs[MAIN_DBI].md_cmp = get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); txn->mt_dbxs[MAIN_DBI].md_dcmp = get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); } + tASSERT(txn, txn->mt_dbxs[MAIN_DBI].md_cmp); + /* Is the DB already open? */ MDBX_dbi scan, slot; for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { @@ -22229,7 +22268,7 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, key.iov_len)) { rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; *dbi = scan; return rc; } @@ -22238,14 +22277,7 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, /* Fail, if no free slot and max hit */ if (unlikely(slot >= env->me_maxdbs)) { rc = MDBX_DBS_FULL; - goto early_bailout; - } - - /* Cannot mix named table with some main-table flags */ - if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & - (MDBX_DUPSORT | MDBX_INTEGERKEY))) { - rc = (user_flags & MDBX_CREATE) ? 
MDBX_INCOMPATIBLE : MDBX_NOTFOUND; - goto early_bailout; + goto bailout; } /* Find the DB info */ @@ -22253,37 +22285,36 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, MDBX_cursor_couple couple; rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) - goto early_bailout; + goto bailout; } else { /* make sure this is actually a table */ MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top], couple.outer.mc_ki[couple.outer.mc_top]); if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { rc = MDBX_INCOMPATIBLE; - goto early_bailout; + goto bailout; } if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; - goto early_bailout; + goto bailout; } } if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { rc = MDBX_EACCESS; - goto early_bailout; + goto bailout; } /* Done here so we cannot fail after creating a new DB */ - void *clone = nullptr; if (key.iov_len) { clone = osal_malloc(key.iov_len); if (unlikely(!clone)) { rc = MDBX_ENOMEM; - goto early_bailout; + goto bailout; } key.iov_base = memcpy(clone, key.iov_base, key.iov_len); } else @@ -22292,9 +22323,9 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, int err = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(err != MDBX_SUCCESS)) { rc = err; - osal_free(clone); - goto early_bailout; + goto bailout; } + locked = true; /* Import handles from env */ dbi_import_locked(txn); @@ -22311,15 +22342,15 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, key.iov_len)) { rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto later_bailout; - *dbi = scan; - goto later_exit; + goto bailout; + slot = scan; + goto 
done; } } if (unlikely(slot >= env->me_maxdbs)) { rc = MDBX_DBS_FULL; - goto later_bailout; + goto bailout; } unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID; @@ -22336,9 +22367,8 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, WITH_CURSOR_TRACKING(couple.outer, rc = mdbx_cursor_put(&couple.outer, &key, &data, F_SUBDATA | MDBX_NOOVERWRITE)); - if (unlikely(rc != MDBX_SUCCESS)) - goto later_bailout; + goto bailout; dbiflags |= DBI_DIRTY | DBI_CREAT; txn->mt_flags |= MDBX_TXN_DIRTY; @@ -22352,31 +22382,28 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) { tASSERT(txn, (dbiflags & DBI_CREAT) == 0); - later_bailout: - *dbi = 0; - later_exit: - osal_free(clone); - } else { - txn->mt_dbistate[slot] = (uint8_t)dbiflags; - txn->mt_dbxs[slot].md_name = key; - txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = - dbi_seq(env, slot); - if (!(dbiflags & DBI_CREAT)) - env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; - if (txn->mt_numdbs == slot) { - txn->mt_cursors[slot] = NULL; - osal_compiler_barrier(); - txn->mt_numdbs = slot + 1; - } - if (env->me_numdbs <= slot) { - osal_memory_fence(mo_AcquireRelease, true); - env->me_numdbs = slot + 1; - } - *dbi = slot; + goto bailout; } + txn->mt_dbistate[slot] = (uint8_t)dbiflags; + txn->mt_dbxs[slot].md_name = key; + txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = dbi_seq(env, slot); + if (!(dbiflags & DBI_CREAT)) + env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; + if (txn->mt_numdbs == slot) { + txn->mt_cursors[slot] = NULL; + osal_compiler_barrier(); + txn->mt_numdbs = slot + 1; + } + if (env->me_numdbs <= slot) { + osal_memory_fence(mo_AcquireRelease, true); + env->me_numdbs = slot + 1; + } + +done: + *dbi = slot; ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - return rc; + return MDBX_SUCCESS; } static int 
dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, From 04981144692e7599194d251c96f49d056c25e60f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 21 Dec 2022 20:55:11 +0300 Subject: [PATCH 290/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D1=83=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B8=D0=BD=D1=84=D0=BE=D1=80=D0=BC?= =?UTF-8?q?=D0=B0=D1=86=D0=B8=D0=B8=20=D0=BE=20=D0=B7=D0=B0=D0=B4=D0=B5?= =?UTF-8?q?=D1=80=D0=B6=D0=BA=D0=B0=D1=85=20=D0=B4=D0=BB=D1=8F=20=D0=BD?= =?UTF-8?q?=D0=B5=D0=B2=D0=B0=D0=BB=D0=B8=D0=B4=D0=BD=D1=8B=D1=85=20=D1=82?= =?UTF-8?q?=D1=80=D0=B0=D0=BD=D0=B7=D0=B0=D0=BA=D1=86=D0=B8=D0=B9=20=D0=B2?= =?UTF-8?q?=20`mdbx=5Ftxn=5Fcommit=5Fex()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/core.c b/src/core.c index 69588301..dc530f7d 100644 --- a/src/core.c +++ b/src/core.c @@ -11342,15 +11342,19 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) - goto provide_latency; + if (unlikely(rc != MDBX_SUCCESS)) { + if (latency) + memset(latency, 0, sizeof(*latency)); + return rc; + } MDBX_env *const env = txn->mt_env; #if MDBX_ENV_CHECKPID if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; - rc = MDBX_PANIC; - goto provide_latency; + if (latency) + memset(latency, 0, sizeof(*latency)); + return MDBX_PANIC; } #endif /* MDBX_ENV_CHECKPID */ From a06fe4f168ab72d5af31666988f1d5b7476e58b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 25 Dec 2022 19:56:50 +0300 Subject: [PATCH 291/364] 
=?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D1=80?= =?UTF-8?q?=D0=B0=D0=B1=D0=BE=D1=82=D0=BA=D0=B0=20=D0=BA=D0=BE=D0=BD=D1=82?= =?UTF-8?q?=D1=80=D0=BE=D0=BB=D1=8F=20"=D0=BD=D0=B5=D0=BA=D0=BE=D0=B3?= =?UTF-8?q?=D0=B5=D1=80=D0=B5=D0=BD=D1=82=D0=BD=D0=BE=D1=81=D1=82=D0=B8"?= =?UTF-8?q?=20=D0=B4=D0=BB=D1=8F=20=D1=83=D0=BC=D0=B5=D0=BD=D1=8C=D1=88?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=BD=D0=B0=D0=BA=D0=BB=D0=B0=D0=B4?= =?UTF-8?q?=D0=BD=D1=8B=D1=85=20=D1=80=D0=B0=D1=81=D1=85=D0=BE=D0=B4=D0=BE?= =?UTF-8?q?=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Существует проблема https://libmdbx.dqdkfa.ru/dead-github/issues/269, которая проявляется только при специфической неупорядоченности внутри ядра ОС, когда страницы, записанные в файл отображенный в память, становятся видны в памяти посредством работы unified page cache: - если записанная последней мета-страница "обгоняет" ранее записанные, т.е. когда записанное в файл позже становится видимым в отображении раньше, чем записанное ранее. Теперь, вместо постоянной полной сверки записываемых страниц, выполняется легковесная проверка при старте транзакций, с переключением в режим "как раньше" при обнаружении проблемы. В результате, в некоторых сценариях возвращается 5-10% производительности, а в отдельных синтетических тестах до 30%. 
--- src/core.c | 82 +++++++++++++++++++++++++++++++++++++++++++++---- src/internals.h | 5 +++ 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index dc530f7d..26fd9c08 100644 --- a/src/core.c +++ b/src/core.c @@ -4546,10 +4546,15 @@ typedef struct iov_ctx { __must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages, - mdbx_filehandle_t fd) { + mdbx_filehandle_t fd, + bool check_coherence) { ctx->env = txn->mt_env; ctx->ior = &txn->mt_env->me_ioring; ctx->fd = fd; + ctx->coherency_timestamp = + (check_coherence || txn->mt_env->me_lck->mti_pgop_stat.incoherence.weak) + ? 0 + : UINT64_MAX /* не выполнять сверку */; ctx->err = osal_ioring_prepare(ctx->ior, items, pgno_align2os_bytes(txn->mt_env, npages)); if (likely(ctx->err == MDBX_SUCCESS)) { @@ -4582,9 +4587,63 @@ static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes); osal_flush_incoherent_mmap(rp, bytes, env->me_os_psize); /* check with timeout as the workaround - * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ - if (unlikely(memcmp(wp, rp, bytes))) { + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 + * + * Проблема проявляется только при неупорядоченности: если записанная + * последней мета-страница "обгоняет" ранее записанные, т.е. когда + * записанное в файл позже становится видимым в отображении раньше, + * чем записанное ранее. + * + * Исходно здесь всегда выполнялась полная сверка. Это давало полную + * гарантию защиты от проявления проблемы, но порождало накладные расходы. + * В некоторых сценариях наблюдалось снижение производительности до 10-15%, + * а в синтетических тестах до 30%. Конечно никто не вникал в причины, + * а просто останавливался на мнении "libmdbx не быстрее LMDB", + * например: https://clck.ru/3386er + * + * Поэтому после серии экспериментов и тестов реализовано следующее: + * 0. 
Посредством опции сборки MDBX_FORCE_CHECK_MMAP_COHERENCY=1 + * можно включить полную сверку после записи. + * Остальные пункты являются взвешенным компромиссом между полной + * гарантией обнаружения проблемы и бесполезными затратами на системах + * без этого недостатка. + * 1. При старте транзакций проверяется соответствие выбранной мета-страницы + * корневым страницам b-tree проверяется. Эта проверка показала себя + * достаточной без сверки после записи. При обнаружении "некогерентности" + * эти случаи подсчитываются, а при их ненулевом счетчике выполняется + * полная сверка. Таким образом, произойдет переключение в режим полной + * сверки, если показавшая себя достаточной проверка заметит проявление + * проблемы хоты-бы раз. + * 2. Сверка не выполняется при фиксации транзакции, так как: + * - при наличии проблемы "не-когерентности" (при отложенном копировании + * или обновлении PTE, после возврата из write-syscall), проверка + * в этом процессе не гарантирует актуальность данных в другом + * процессе, который может запустить транзакцию сразу после коммита; + * - сверка только последнего блока позволяет почти восстановить + * производительность в больших транзакциях, но одновременно размывает + * уверенность в отсутствии сбоев, чем обесценивает всю затею; + * - после записи данных будет записана мета-страница, соответствие + * которой корневым страницам b-tree проверяется при старте + * транзакций, и только эта проверка показала себя достаточной; + * 3. При спиллинге производится полная сверка записанных страниц. Тут был + * соблазн сверять не полностью, а например начало и конец каждого блока. + * Но при спиллинге возможна ситуация повторного вытеснения страниц, в + * том числе large/overflow. При этом возникает риск прочитать в текущей + * транзакции старую версию страницы, до повторной записи. В этом случае + * могут возникать крайне редкие невоспроизводимые ошибки. 
С учетом того + * что спиллинг выполняет крайне редко, решено отказаться от экономии + * в пользу надежности. */ +#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY +#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0 +#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */ + if ((MDBX_FORCE_CHECK_MMAP_COHERENCY || + ctx->coherency_timestamp != UINT64_MAX) && + unlikely(memcmp(wp, rp, bytes))) { ctx->coherency_timestamp = 0; + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, "(workaround for incoherent flaw of unified page/buffer cache)"); do @@ -5074,7 +5133,8 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, #if defined(_WIN32) || defined(_WIN64) txn->mt_env->me_overlapped_fd ? txn->mt_env->me_overlapped_fd : #endif - txn->mt_env->me_lazy_fd); + txn->mt_env->me_lazy_fd, + true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -8530,6 +8590,11 @@ static bool coherency_check(const MDBX_env *env, const txnid_t txnid, ok = false; } } + if (unlikely(!ok) && report) + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; return ok; } @@ -8579,11 +8644,16 @@ static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, const bool report = !(timestamp && *timestamp); const txnid_t head_txnid = meta_txnid(meta); if (unlikely(head_txnid < MIN_TXNID || (head_txnid < txnid))) { - if (report) + if (report) { + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", (head_txnid < MIN_TXNID) ? 
"invalid" : "unexpected", head_txnid, bytes2pgno(env, ptr_dist(meta, env->me_map)), "(workaround for incoherent flaw of unified page/buffer cache)"); + } return coherency_timeout(timestamp, 0); } return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp); @@ -11678,7 +11748,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { iov_ctx_t write_ctx; rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, - txn->tw.dirtylist->pages_including_loose, fd); + txn->tw.dirtylist->pages_including_loose, fd, false); if (unlikely(rc != MDBX_SUCCESS)) { ERROR("txn-%s: error %d", "iov-init", rc); goto fail; diff --git a/src/internals.h b/src/internals.h index eb14a15b..684628e5 100644 --- a/src/internals.h +++ b/src/internals.h @@ -622,6 +622,11 @@ typedef struct pgop_stat { MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + /* Статистика для профилирования GC. * Логически эти данные может быть стоит вынести в другую структуру, * но разница будет сугубо косметическая. 
*/ From d29acf4fdca8d5615c086a151fe9920a1207679b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 25 Dec 2022 23:58:56 +0300 Subject: [PATCH 292/364] =?UTF-8?q?mdbx:=20=D0=B0=D0=BA=D1=82=D1=83=D0=B0?= =?UTF-8?q?=D0=BB=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20bits.md=20(=D0=B2?= =?UTF-8?q?=D0=BD=D1=83=D1=82=D1=80=D0=B5=D0=BD=D0=BD=D0=B8=D0=B9=20=D1=81?= =?UTF-8?q?=D0=BF=D1=80=D0=B0=D0=B2=D0=BE=D1=87=D0=BD=D0=B8=D0=BA).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bits.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bits.md b/src/bits.md index 7566cdf3..e8708f02 100644 --- a/src/bits.md +++ b/src/bits.md @@ -1,10 +1,10 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE | MRESIZE | --|---------|-----------|--------------|----------|-----------|------------|---------|----------|---------| -0 |0000 0001|ALLOC_CACHE|TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH | | -1 |0000 0002|ALLOC_GC |TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF | | -2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| | -3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | -4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | +0 |0000 0001|ALLOC_RSRV |TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH | | +1 |0000 0002|ALLOC_UNIMP|TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF | | +2 |0000 0004|ALLOC_COLSC|TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| | +3 |0000 0008|ALLOC_SSCAN|TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | +4 |0000 0010|ALLOC_FIFO |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | 5 |0000 0020| |TXN_DRAINED_GC|INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | 6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | 7 |0000 0080| | | |ALLDUPS 
|DBI_AUDITED | | | | From e458af602e21634060c34bb02866f78d8b8a0b95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 26 Dec 2022 00:00:09 +0300 Subject: [PATCH 293/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BD=D0=B5=D0=BD=D1=83=D0=B6?= =?UTF-8?q?=D0=BD=D1=8B=D1=85=20=D1=83=D1=81=D0=BB=D0=BE=D0=B2=D0=B8=D0=B9?= =?UTF-8?q?=20=D0=B2=20=D0=BE=D1=82=D0=BB=D0=B0=D0=B4=D0=BA=D0=B5=20(?= =?UTF-8?q?=D0=BD=D0=B5=D1=81=D1=83=D1=89=D0=B5=D1=81=D1=82=D0=B2=D0=B5?= =?UTF-8?q?=D0=BD=D0=BD=D0=BE).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/core.c b/src/core.c index 26fd9c08..46cabb0b 100644 --- a/src/core.c +++ b/src/core.c @@ -4827,10 +4827,7 @@ spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { MDBX_page *const dp = dl->items[i].ptr; if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { DEBUG("skip %s %zu page %" PRIaPGNO, - (dp->mp_flags & P_LOOSE) ? "loose" - : (dp->mp_flags & P_LOOSE) ? "loose" - : "parent-spilled", - npages, pgno); + (dp->mp_flags & P_LOOSE) ? 
"loose" : "parent-spilled", npages, pgno); return 256; } From bb2e3967ebffc22c083af81cd40185ee474ad095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 26 Dec 2022 20:28:18 +0300 Subject: [PATCH 294/364] =?UTF-8?q?mdbx:=20=D1=83=D0=BC=D0=B5=D0=BD=D1=8C?= =?UTF-8?q?=D1=88=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BA=D0=BE=D0=BB-=D0=B2?= =?UTF-8?q?=D0=B0=20=D0=B2=D1=8B=D0=B7=D0=BE=D0=B2=D0=BE=D0=B2=20`realloc(?= =?UTF-8?q?)`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/core.c b/src/core.c index 46cabb0b..976a2251 100644 --- a/src/core.c +++ b/src/core.c @@ -2270,8 +2270,9 @@ static void pnl_shrink(MDBX_PNL *ppl) { MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); MDBX_PNL_SETSIZE(*ppl, 0); if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > - MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { - size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL); + MDBX_PNL_INITIAL * (MDBX_PNL_PREALLOC_FOR_RADIXSORT ? 8 : 4) - + MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { + size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL * 2); MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -2815,19 +2816,14 @@ static int dpl_alloc(MDBX_txn *txn) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) - ? 
txn->mt_env->me_options.dp_initial - : txn->mt_geo.upper; - if (txn->tw.dirtylist) { - dpl_clear(txn->tw.dirtylist); - const int realloc_threshold = 64; - if (likely( - !((int)(txn->tw.dirtylist->detent - wanna) > realloc_threshold || - (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold))) - return MDBX_SUCCESS; - } - if (unlikely(!dpl_reserve(txn, wanna))) + const size_t wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) + ? txn->mt_env->me_options.dp_initial + : txn->mt_geo.upper; + if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || + txn->tw.dirtylist->detent > wanna + wanna) && + unlikely(!dpl_reserve(txn, wanna))) return MDBX_ENOMEM; + dpl_clear(txn->tw.dirtylist); return MDBX_SUCCESS; } From 0941319940e5b52d81a3b82daffeb2b2048f029c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 26 Dec 2022 20:30:32 +0300 Subject: [PATCH 295/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B0=D1=80=D0=BE=D1=87?= =?UTF-8?q?=D0=BA=D0=B0=20=D0=BD=D0=B5=D0=B7=D0=BD=D0=B0=D1=87=D0=B8=D1=82?= =?UTF-8?q?=D0=B5=D0=BB=D1=8C=D0=BD=D1=8B=D1=85=20likely.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/osal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/osal.c b/src/osal.c index 8a76d865..997b9adf 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1571,7 +1571,7 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, return MDBX_SUCCESS /* nothing to do */; #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 case MDBX_SYNC_DATA: - if (fdatasync(fd) == 0) + if (likely(fdatasync(fd) == 0)) return MDBX_SUCCESS; break /* error */; #if defined(__linux__) || defined(__gnu_linux__) @@ -1581,7 +1581,7 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, #endif /* Linux */ #endif /* _POSIX_SYNCHRONIZED_IO > 0 */ default: - if (fsync(fd) == 0) + if 
(likely(fsync(fd) == 0)) return MDBX_SUCCESS; } From 61d21b0a02c10328fd274d7905da2636c126810d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 27 Dec 2022 15:03:54 +0300 Subject: [PATCH 296/364] =?UTF-8?q?mdbx:=20=D0=BD=D0=B5=20=D1=82=D1=80?= =?UTF-8?q?=D0=BE=D0=B3=D0=B0=D1=82=D1=8C=20LRU=20=D0=B8=20dbi=20=D0=B2=20?= =?UTF-8?q?cursor=5Ftouch()=20=D0=B4=D0=BB=D1=8F=20=D0=B2=D0=BB=D0=BE?= =?UTF-8?q?=D0=B6=D0=B5=D0=BD=D0=BD=D1=8B=D1=85=20=D0=BA=D1=83=D1=80=D1=81?= =?UTF-8?q?=D0=BE=D1=80=D0=BE=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/core.c b/src/core.c index 976a2251..091daa05 100644 --- a/src/core.c +++ b/src/core.c @@ -16909,22 +16909,22 @@ static int touch_dbi(MDBX_cursor *mc) { return MDBX_SUCCESS; } -static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, - const MDBX_val *data) { +static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data) { cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0); cASSERT(mc, (mc->mc_flags & C_INITIALIZED) || mc->mc_snum == 0); cASSERT(mc, cursor_is_tracked(mc)); - txn_lru_turn(mc->mc_txn); - - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - int err = touch_dbi(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - if ((mc->mc_flags & C_SUB) == 0) { MDBX_txn *const txn = mc->mc_txn; + txn_lru_turn(txn); + + if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { + int err = touch_dbi(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + /* Estimate how much space this operation will take: */ /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ size_t need = CURSOR_STACK + 3; From 66a57049494d45227f51e756bf15901262235f5a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 27 Dec 2022 14:51:07 +0300 Subject: [PATCH 297/364] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=B4=D0=B5=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`cursor=5Fput()`=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=83=D0=BC=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F=20=D0=BA=D0=BE=D0=BB-=D0=B2=D0=B0=20=D0=BF=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B5=D1=80=D0=BE=D0=BA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 319 ++++++++++++++++++++++++++++------------------------- 1 file changed, 169 insertions(+), 150 deletions(-) diff --git a/src/core.c b/src/core.c index 091daa05..7507983f 100644 --- a/src/core.c +++ b/src/core.c @@ -3329,6 +3329,14 @@ static int __must_check_result audit_ex(MDBX_txn *txn, size_t retired_stored, static int __must_check_result page_check(const MDBX_cursor *const mc, const MDBX_page *const mp); static int __must_check_result cursor_check(const MDBX_cursor *mc); +static int __must_check_result cursor_put_checklen(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); +static int __must_check_result cursor_put_nochecklen(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); static int __must_check_result cursor_check_updating(MDBX_cursor *mc); static int __must_check_result cursor_del(MDBX_cursor *mc); static int __must_check_result delete(MDBX_txn *txn, MDBX_dbi dbi, @@ -10420,7 +10428,7 @@ retry: ? 
env->me_maxgc_ov1page : left; data.iov_len = (chunk + 1) * sizeof(pgno_t); - rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -10458,7 +10466,7 @@ retry: do { gcu_prepare_backlog(txn, ctx); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Retry if tw.retired_pages[] grew during the Put() */ @@ -10753,8 +10761,8 @@ retry: TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); gcu_prepare_backlog(txn, ctx); - rc = mdbx_cursor_put(&ctx->cursor, &key, &data, - MDBX_RESERVE | MDBX_NOOVERWRITE); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, + MDBX_RESERVE | MDBX_NOOVERWRITE); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) @@ -10859,8 +10867,8 @@ retry: } chunk = left; } - rc = mdbx_cursor_put(&ctx->cursor, &key, &data, - MDBX_CURRENT | MDBX_RESERVE); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, + MDBX_CURRENT | MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; gcu_clean_reserved(env, data); @@ -11634,10 +11642,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* Может быть mod_txnid > front после коммита вложенных тразакций */ db->md_mod_txnid = txn->mt_txnid; data.iov_base = db; - WITH_CURSOR_TRACKING(couple.outer, - rc = mdbx_cursor_put(&couple.outer, - &txn->mt_dbxs[i].md_name, - &data, F_SUBDATA)); + WITH_CURSOR_TRACKING( + couple.outer, + rc = cursor_put_nochecklen(&couple.outer, &txn->mt_dbxs[i].md_name, + &data, F_SUBDATA)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -16969,130 +16977,14 @@ static __hot int 
cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, return rc; } -__hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, - unsigned flags) { - MDBX_env *env; - MDBX_page *sub_root = NULL; +static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, + MDBX_val *data, unsigned flags) { + MDBX_page *sub_root = nullptr; MDBX_val xdata, *rdata, dkey, olddata; MDBX_db nested_dupdb; int err; DKBUF_DEBUG; - - if (unlikely(mc == NULL || key == NULL || data == NULL)) - return MDBX_EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; - - int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) - return MDBX_BAD_DBI; - - cASSERT(mc, cursor_is_tracked(mc)); - env = mc->mc_txn->mt_env; - - /* Check this first so counter will always be zero on any early failures. */ - size_t mcount = 0, dcount = 0; - if (unlikely(flags & MDBX_MULTIPLE)) { - if (unlikely(flags & MDBX_RESERVE)) - return MDBX_EINVAL; - if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - dcount = data[1].iov_len; - if (unlikely(dcount < 2 || data->iov_len == 0)) - return MDBX_BAD_VALSIZE; - if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) - return MDBX_BAD_VALSIZE; - if (unlikely(dcount > MAX_MAPSIZE / 2 / - (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { - /* checking for multiplication overflow */ - if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) - return MDBX_TOO_LARGE; - } - data[1].iov_len = 0 /* reset done item counter */; - } - - if (flags & MDBX_RESERVE) { - if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | - MDBX_INTEGERDUP | MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - data->iov_base = nullptr; - } - - if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return 
(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS - : MDBX_BAD_TXN; - - uint64_t aligned_keybytes, aligned_databytes; - MDBX_val aligned_key, aligned_data; - if (likely((mc->mc_flags & C_SUB) == 0)) { - if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || - key->iov_len > mc->mc_dbx->md_klen_max)) { - cASSERT(mc, !"Invalid key-size"); - return MDBX_BAD_VALSIZE; - } - if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || - data->iov_len > mc->mc_dbx->md_vlen_max)) { - cASSERT(mc, !"Invalid data-size"); - return MDBX_BAD_VALSIZE; - } - - if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { - switch (key->iov_len) { - default: - cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4); - key = &aligned_key; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8); - key = &aligned_key; - } - break; - } - } - if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { - switch (data->iov_len) { - default: - cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - aligned_data.iov_len = 4); - data = &aligned_data; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - 
aligned_data.iov_len = 8); - data = &aligned_data; - } - break; - } - } - } - + MDBX_env *const env = mc->mc_txn->mt_env; DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, DDBI(mc), DKEY_DEBUG(key), key->iov_len, DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); @@ -17107,9 +16999,9 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц * с MDBX_DUPSORT также требуется текущий размер данных. */ MDBX_val current_key, current_data; - rc = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT); + if (unlikely(err != MDBX_SUCCESS)) + return err; if (mc->mc_dbx->md_cmp(key, &current_key) != 0) return MDBX_EKEYMISMATCH; @@ -17127,16 +17019,16 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (mc->mc_xcursor->mx_db.md_entries > 1 || current_data.iov_len != data->iov_len) { drop_current: - rc = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_CURRENT; goto skip_check_samedata; } } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) { - rc = mdbx_cursor_del(mc, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = mdbx_cursor_del(mc, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_CURRENT; goto skip_check_samedata; } @@ -17147,6 +17039,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, skip_check_samedata:; } + int rc = MDBX_SUCCESS; if (mc->mc_db->md_root == P_INVALID) { /* new database, cursor has nothing to point to */ mc->mc_snum = 0; @@ -17196,9 +17089,9 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } if (unlikely(flags 
& MDBX_ALLDUPS) && mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - rc = mdbx_cursor_del(mc, MDBX_ALLDUPS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = mdbx_cursor_del(mc, MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_ALLDUPS; rc = MDBX_NOTFOUND; exact = false; @@ -17227,12 +17120,16 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } mc->mc_flags &= ~C_DEL; - /* Cursor is positioned, check for room in the dirty list */ rdata = data; + size_t mcount = 0, dcount = 0; if (unlikely(flags & MDBX_MULTIPLE)) { + dcount = data[1].iov_len; + data[1].iov_len = 0 /* reset done item counter */; rdata = &xdata; xdata.iov_len = data->iov_len * dcount; } + + /* Cursor is positioned, check for room in the dirty list */ err = cursor_touch(mc, key, rdata); if (unlikely(err)) return err; @@ -17670,7 +17567,8 @@ new_sub:; mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; /* converted, write the original data first */ if (dupdata_flag) { - rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, + xflags); if (unlikely(rc)) goto bad_sub; /* we've done our job */ @@ -17706,7 +17604,8 @@ new_sub:; STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == MDBX_APPEND); xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; - rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &xdata, + xflags); if (flags & F_SUBDATA) { void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; @@ -17752,6 +17651,126 @@ new_sub:; return rc; } +static __hot int cursor_put_checklen(MDBX_cursor *mc, const MDBX_val *key, + MDBX_val *data, unsigned flags) { + cASSERT(mc, (mc->mc_flags & C_SUB) == 0); + uint64_t aligned_keybytes, aligned_databytes; + MDBX_val aligned_key, 
aligned_data; + if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || + key->iov_len > mc->mc_dbx->md_klen_max)) { + cASSERT(mc, !"Invalid key-size"); + return MDBX_BAD_VALSIZE; + } + if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || + data->iov_len > mc->mc_dbx->md_vlen_max)) { + cASSERT(mc, !"Invalid data-size"); + return MDBX_BAD_VALSIZE; + } + + if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { + switch (key->iov_len) { + default: + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + case 4: + if (unlikely(3 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4); + key = &aligned_key; + } + break; + case 8: + if (unlikely(7 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8); + key = &aligned_key; + } + break; + } + } + if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { + switch (data->iov_len) { + default: + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + case 4: + if (unlikely(3 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, + aligned_data.iov_len = 4); + data = &aligned_data; + } + break; + case 8: + if (unlikely(7 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, + aligned_data.iov_len = 8); + data = &aligned_data; + } + break; + } + } + return cursor_put_nochecklen(mc, key, data, flags); +} + +int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val 
*data, + unsigned flags) { + if (unlikely(mc == NULL || key == NULL || data == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + + cASSERT(mc, cursor_is_tracked(mc)); + + /* Check this first so counter will always be zero on any early failures. */ + if (unlikely(flags & MDBX_MULTIPLE)) { + if (unlikely(flags & MDBX_RESERVE)) + return MDBX_EINVAL; + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + const size_t dcount = data[1].iov_len; + if (unlikely(dcount < 2 || data->iov_len == 0)) + return MDBX_BAD_VALSIZE; + if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) + return MDBX_BAD_VALSIZE; + if (unlikely(dcount > MAX_MAPSIZE / 2 / + (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { + /* checking for multiplication overflow */ + if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) + return MDBX_TOO_LARGE; + } + } + + if (flags & MDBX_RESERVE) { + if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_INTEGERDUP | MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + data->iov_base = nullptr; + } + + if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS + : MDBX_BAD_TXN; + + return cursor_put_checklen(mc, key, data, flags); +} + __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(!mc)) return MDBX_EINVAL; @@ -20871,7 +20890,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, } if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_cursor_put(&cx.outer, key, data, flags); + rc = cursor_put_checklen(&cx.outer, key, data, flags); txn->mt_cursors[dbi] = cx.outer.mc_next; return rc; @@ -22431,9 +22450,9 @@ static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS; data.iov_len = sizeof(db_dummy); data.iov_base = &db_dummy; - WITH_CURSOR_TRACKING(couple.outer, - rc = mdbx_cursor_put(&couple.outer, &key, &data, - F_SUBDATA | MDBX_NOOVERWRITE)); + WITH_CURSOR_TRACKING( + couple.outer, rc = cursor_put_checklen(&couple.outer, &key, &data, + F_SUBDATA | MDBX_NOOVERWRITE)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -24079,7 +24098,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, } if (likely(new_data)) - rc = mdbx_cursor_put(&cx.outer, key, new_data, flags); + rc = cursor_put_checklen(&cx.outer, key, new_data, flags); else rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS); From df63ff0e7e554ecf8a387f4c334f605b172818ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 27 Dec 2022 15:03:44 +0300 Subject: [PATCH 298/364] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=B4=D0=B5=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`cursor=5Fdel()`=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=83=D0=BC=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F=20=D0=BA=D0=BE=D0=BB-=D0=B2=D0=B0=20=D0=BF=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B5=D1=80=D0=BE=D0=BA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 259 
++++++++++++++++++++++++++--------------------------- 1 file changed, 125 insertions(+), 134 deletions(-) diff --git a/src/core.c b/src/core.c index 7507983f..589dd7b4 100644 --- a/src/core.c +++ b/src/core.c @@ -3338,7 +3338,8 @@ static int __must_check_result cursor_put_nochecklen(MDBX_cursor *mc, MDBX_val *data, unsigned flags); static int __must_check_result cursor_check_updating(MDBX_cursor *mc); -static int __must_check_result cursor_del(MDBX_cursor *mc); +static int __must_check_result cursor_del(MDBX_cursor *mc, + MDBX_put_flags_t flags); static int __must_check_result delete(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data, unsigned flags); @@ -10014,7 +10015,7 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { const struct cursor_set_result csr = cursor_set(gc, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; - err = mdbx_cursor_del(gc, 0); + err = cursor_del(gc, 0); TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); } @@ -10217,7 +10218,7 @@ retry: TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix_mode, ctx->cleaned_slot, ctx->cleaned_id); tASSERT(txn, *txn->mt_cursors == &ctx->cursor); - rc = mdbx_cursor_del(&ctx->cursor, 0); + rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); @@ -10252,7 +10253,7 @@ retry: TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, ctx->cleaned_id); tASSERT(txn, *txn->mt_cursors == &ctx->cursor); - rc = mdbx_cursor_del(&ctx->cursor, 0); + rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -17019,14 +17020,14 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (mc->mc_xcursor->mx_db.md_entries > 1 || current_data.iov_len != data->iov_len) { drop_current: - err = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS); + err = cursor_del(mc, 
flags & MDBX_ALLDUPS); if (unlikely(err != MDBX_SUCCESS)) return err; flags -= MDBX_CURRENT; goto skip_check_samedata; } } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) { - err = mdbx_cursor_del(mc, 0); + err = cursor_del(mc, 0); if (unlikely(err != MDBX_SUCCESS)) return err; flags -= MDBX_CURRENT; @@ -17089,7 +17090,7 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, } if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - err = mdbx_cursor_del(mc, MDBX_ALLDUPS); + err = cursor_del(mc, MDBX_ALLDUPS); if (unlikely(err != MDBX_SUCCESS)) return err; flags -= MDBX_ALLDUPS; @@ -17771,7 +17772,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, return cursor_put_checklen(mc, key, data, flags); } -__hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { +int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(!mc)) return MDBX_EINVAL; @@ -17792,7 +17793,14 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) return MDBX_NOTFOUND; - rc = cursor_touch(mc, nullptr, nullptr); + return cursor_del(mc, flags); +} + +static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { + cASSERT(mc, mc->mc_flags & C_INITIALIZED); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top])); + + int rc = cursor_touch(mc, nullptr, nullptr); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -17808,22 +17816,21 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (node_flags(node) & F_DUPDATA) { if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { - /* cursor_del() will subtract the final entry */ + /* will subtract the final entry later */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; 
mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, 0); + rc = cursor_del(&mc->mc_xcursor->mx_cursor, 0); if (unlikely(rc)) return rc; /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { if (node_flags(node) & F_SUBDATA) { /* update subDB info */ - void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); + memcpy(node_data(node), &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { /* shrink fake page */ node_shrink(mp, mc->mc_ki[mc->mc_top]); @@ -17878,7 +17885,109 @@ __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } del_key: - return cursor_del(mc); + mc->mc_db->md_entries--; + const MDBX_dbi dbi = mc->mc_dbi; + indx_t ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_LEAF(mp)); + node_del(mc, mc->mc_db->md_xsize); + + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + + rc = rebalance(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + if (unlikely(!mc->mc_snum)) { + /* DB is totally empty now, just bail out. 
+ * Other cursors adjustments were already done + * by rebalance and aren't needed here. */ + cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); + mc->mc_flags |= C_EOF; + return MDBX_SUCCESS; + } + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + size_t nkeys = page_numkeys(mp); + cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); + + /* Adjust this and other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = cursor_sibling(m3, SIBLING_RIGHT); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; + } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + if (m3->mc_ki[mc->mc_top] >= ki || + /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + node = page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not inited it must be reinited. + * Else if node points to a subDB, nothing is needed. 
*/ + if (node_flags(node) & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + } else { + rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + m3->mc_flags |= C_DEL; + } + } + } + + cASSERT(mc, rc == MDBX_SUCCESS); + if (AUDIT_ENABLED()) + rc = cursor_check(mc); + return rc; fail: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; @@ -20058,124 +20167,6 @@ __cold static int cursor_check_updating(MDBX_cursor *mc) { return rc; } -/* Complete a delete operation started by mdbx_cursor_del(). */ -static int cursor_del(MDBX_cursor *mc) { - int rc; - MDBX_page *mp; - indx_t ki; - size_t nkeys; - MDBX_dbi dbi = mc->mc_dbi; - - cASSERT(mc, cursor_is_tracked(mc)); - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node_del(mc, mc->mc_db->md_xsize); - mc->mc_db->md_entries--; - - /* Adjust other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } - - rc = rebalance(mc); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - if (unlikely(!mc->mc_snum)) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by rebalance and aren't needed here. */ - cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); - mc->mc_flags |= C_EOF; - return MDBX_SUCCESS; - } - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - nkeys = page_numkeys(mp); - cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && - nkeys == 0)); - - /* Adjust this and other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = cursor_sibling(m3, SIBLING_RIGHT); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - continue; - } - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - if (m3->mc_ki[mc->mc_top] >= ki || - /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { - if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { - MDBX_node *node = - page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not inited it must be reinited. - * Else if node points to a subDB, nothing is needed. */ - if (node_flags(node) & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - } - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - m3->mc_flags |= C_DEL; - } - } - } - - cASSERT(mc, rc == MDBX_SUCCESS); - if (AUDIT_ENABLED()) - rc = cursor_check(mc); - return rc; - -bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return rc; -} - int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data) { int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); @@ -20228,7 +20219,7 @@ static int delete(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * cursor to be consistent until the end of the rebalance. 
*/ cx.outer.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_del(&cx.outer, flags); + rc = cursor_del(&cx.outer, flags); txn->mt_cursors[dbi] = cx.outer.mc_next; } return rc; @@ -24100,7 +24091,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (likely(new_data)) rc = cursor_put_checklen(&cx.outer, key, new_data, flags); else - rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS); + rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS); bailout: txn->mt_cursors[dbi] = cx.outer.mc_next; From ef460a922941e14e9eb0962a8f097580890406b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 27 Dec 2022 16:39:07 +0300 Subject: [PATCH 299/364] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=B4=D0=B5=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`cursor=5Fget()`=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=83=D0=BC=D0=B5=D0=BD=D1=8C=D1=88=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F=20=D0=BA=D0=BE=D0=BB-=D0=B2=D0=B0=20=D0=BF=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=B5=D1=80=D0=BE=D0=BA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 68 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/src/core.c b/src/core.c index 589dd7b4..0c303fd3 100644 --- a/src/core.c +++ b/src/core.c @@ -3329,6 +3329,8 @@ static int __must_check_result audit_ex(MDBX_txn *txn, size_t retired_stored, static int __must_check_result page_check(const MDBX_cursor *const mc, const MDBX_page *const mp); static int __must_check_result cursor_check(const MDBX_cursor *mc); +static int __must_check_result cursor_get(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); static int __must_check_result cursor_put_checklen(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, @@ -7293,7 +7295,7 @@ next_gc:; #endif /* MDBX_ENABLE_PROFGC */ /* Seek 
first/next GC record */ - ret.err = mdbx_cursor_get(gc, &key, NULL, op); + ret.err = cursor_get(gc, &key, NULL, op); if (unlikely(ret.err != MDBX_SUCCESS)) { if (unlikely(ret.err != MDBX_NOTFOUND)) goto fail; @@ -9863,7 +9865,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, size_t gc = 0; MDBX_val key, data; - while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { + while ((rc = cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { if (!dont_filter_gc) { if (unlikely(key.iov_len != sizeof(txnid_t))) return MDBX_CORRUPTED; @@ -10204,7 +10206,7 @@ retry: ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &ctx->cleaned_id; key.iov_len = sizeof(ctx->cleaned_id); - rc = mdbx_cursor_get(&ctx->cursor, &key, NULL, MDBX_SET); + rc = cursor_set(&ctx->cursor, &key, NULL, MDBX_SET).err; if (rc == MDBX_NOTFOUND) continue; if (unlikely(rc != MDBX_SUCCESS)) @@ -10601,7 +10603,7 @@ retry: ctx->rid -= 1; key.iov_base = &ctx->rid; key.iov_len = sizeof(ctx->rid); - rc = mdbx_cursor_get(&ctx->cursor, &key, &data, MDBX_SET_KEY); + rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(rc == MDBX_SUCCESS)) { DEBUG("%s: GC's id %" PRIaTXN " is present, going to first", dbg_prefix_mode, ctx->rid); @@ -10844,7 +10846,7 @@ retry: dbg_prefix_mode, fill_gc_id, ctx->filled_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - rc = mdbx_cursor_get(&ctx->cursor, &key, &data, MDBX_SET_KEY); + rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -15869,7 +15871,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); + return cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); } int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, @@ -16545,20 +16547,11 @@ static int 
cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { return MDBX_SUCCESS; } -__hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; - - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - +static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); + int rc; + switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) @@ -16597,8 +16590,8 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(rc)) return rc; } else { - rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_GET_CURRENT); + rc = cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_GET_CURRENT); if (unlikely(rc)) return rc; } @@ -16704,8 +16697,7 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; - } - { + } else { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node_flags(node) & F_DUPDATA)) { get_key_optional(node, key); @@ -16775,6 +16767,22 @@ __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } +int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return cursor_get(mc, key, data, op); +} + static int cursor_first_batch(MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { int err = page_search(mc, NULL, MDBX_PS_FIRST); @@ -16997,10 +17005,10 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи, * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает * со значением в текущей позиции курсора. - * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц + * Здесь проще вызвать cursor_get(), так как для обслуживания таблиц * с MDBX_DUPSORT также требуется текущий размер данных. */ MDBX_val current_key, current_data; - err = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT); + err = cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT); if (unlikely(err != MDBX_SUCCESS)) return err; if (mc->mc_dbx->md_cmp(key, &current_key) != 0) @@ -20865,7 +20873,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /* LY: support for update (explicit overwrite) */ if (flags & MDBX_CURRENT) { - rc = mdbx_cursor_get(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET); + rc = cursor_set(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET).err; if (likely(rc == MDBX_SUCCESS) && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) && (flags & MDBX_ALLDUPS) == 0) { @@ -21288,7 +21296,7 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, read_txn->mt_dbs[FREE_DBI].md_leaf_pages + read_txn->mt_dbs[FREE_DBI].md_overflow_pages; MDBX_val key, data; - while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == + while ((rc = cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == MDBX_SUCCESS) { const MDBX_PNL pnl = data.iov_base; if (unlikely(data.iov_len % sizeof(pgno_t) || @@ -23802,7 +23810,7 @@ int mdbx_estimate_move(const 
MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, } next.outer.mc_signature = MDBX_MC_LIVE; - rc = mdbx_cursor_get(&next.outer, key, data, move_op); + rc = cursor_get(&next.outer, key, data, move_op); if (unlikely(rc != MDBX_SUCCESS && (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED)))) return rc; @@ -24031,7 +24039,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, /* убираем лишний бит, он был признаком запрошенного режима */ flags -= MDBX_NOOVERWRITE; - rc = mdbx_cursor_get(&cx.outer, &present_key, old_data, MDBX_GET_BOTH); + rc = cursor_set(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err; if (rc != MDBX_SUCCESS) goto bailout; } else { @@ -24039,7 +24047,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) return MDBX_EINVAL; MDBX_val present_data; - rc = mdbx_cursor_get(&cx.outer, &present_key, &present_data, MDBX_SET_KEY); + rc = cursor_set(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) { old_data->iov_base = NULL; old_data->iov_len = 0; From 7ffea70087167a9d9375a902f504313b5fe45336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 28 Dec 2022 16:23:08 +0300 Subject: [PATCH 300/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B0=20loose-=D0=BF=D1=83=D1=82=D0=B8=20?= =?UTF-8?q?=D0=B2=20`page=5Fretire()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 174 ++++++++++++++++++++++++++--------------------------- 1 file changed, 86 insertions(+), 88 deletions(-) diff --git a/src/core.c b/src/core.c index 0c303fd3..62d2cc65 100644 --- a/src/core.c +++ b/src/core.c @@ -4183,40 +4183,51 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, } /* Remove page from 
dirty list, etc */ -static __inline void page_wash(MDBX_txn *txn, const size_t di, - MDBX_page *const mp, const size_t npages) { +static __inline void page_wash(MDBX_txn *txn, size_t di, MDBX_page *const mp, + const size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_BAD; - if (di) { - tASSERT(txn, txn->tw.dirtylist != nullptr); + if (txn->tw.dirtylist) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, di <= txn->tw.dirtylist->length && - txn->tw.dirtylist->items[di].ptr == mp); - dpl_remove_ex(txn, di, npages); - txn->tw.dirtyroom++; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - } else { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); - if (txn->tw.dirtylist == nullptr) { - tASSERT(txn, !MDBX_AVOID_MSYNC); - txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) - ? npages - : txn->tw.writemap_dirty_npages; + tASSERT(txn, + MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp)); + if (!MDBX_AVOID_MSYNC || di) { + dpl_remove_ex(txn, di, npages); + txn->tw.dirtyroom++; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + dpage_free(txn->mt_env, mp, npages); + return; + } } + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di); + txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) + ? 
npages + : txn->tw.writemap_dirty_npages; } - VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); - if (txn->mt_flags & MDBX_WRITEMAP) { - VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - } else - dpage_free(txn->mt_env, mp, npages); + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); +} + +static __inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) { + /* TODO: + * 1) при включенной "экономии последовательностей" проверить, что + * страница не примыкает к какой-либо из уже находящийся в reclaimed. + * 2) стоит подумать над тем, чтобы при большом loose-списке отбрасывать + половину в reclaimed. */ + return txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && + (!MDBX_ENABLE_REFUND || + /* skip pages near to the end in favor of compactification */ + txn->mt_next_pgno > pgno + txn->mt_env->me_options.dp_loose_limit || + txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit); } /* Retire, loosen or free a single page. @@ -4316,7 +4327,8 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); } else if (IS_MODIFIABLE(txn, mp)) { status = modifable; - di = txn->tw.dirtylist ? dpl_exist(txn, pgno) : 0; + if (txn->tw.dirtylist) + di = dpl_exist(txn, pgno); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) || !IS_SPILLED(txn, mp)); tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); } else if (IS_SHADOWED(txn, mp)) { @@ -4370,7 +4382,7 @@ status_done: * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "dirty"; /* Remove from dirty list */ - page_wash(txn, di, mp ? 
mp : pgno2page(txn->mt_env, pgno), npages); + page_wash(txn, di, mp, npages); } else if (si) { /* Страница пролита в этой транзакции, т.е. она аллоцирована * и запачкана в этой или одной из родительских транзакций. @@ -4409,71 +4421,60 @@ status_done: } if (status == modifable) { - if (di) { - /* Dirty page from this transaction */ - /* If suitable we can reuse it through loose list */ - if (likely( - npages == 1 && - txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && - (!MDBX_ENABLE_REFUND || - /* skip pages near to the end in favor of compactification */ - txn->mt_next_pgno > - pgno + txn->mt_env->me_options.dp_loose_limit || - txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { - DEBUG("loosen dirty page %" PRIaPGNO, pgno); - if (MDBX_DEBUG != 0 || - unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) - memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); - mp->mp_txnid = INVALID_TXNID; - mp->mp_flags = P_LOOSE; - mp_next(mp) = txn->tw.loose_pages; - txn->tw.loose_pages = mp; - txn->tw.loose_count++; + /* Dirty page from this transaction */ + /* If suitable we can reuse it through loose list */ + if (likely(npages == 1 && suitable4loose(txn, pgno)) && + (di || !txn->tw.dirtylist)) { + DEBUG("loosen dirty page %" PRIaPGNO, pgno); + if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); + mp->mp_txnid = INVALID_TXNID; + mp->mp_flags = P_LOOSE; + mp_next(mp) = txn->tw.loose_pages; + txn->tw.loose_pages = mp; + txn->tw.loose_count++; #if MDBX_ENABLE_REFUND - txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) - ? pgno + 2 - : txn->tw.loose_refund_wl; + txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) + ? 
pgno + 2 + : txn->tw.loose_refund_wl; #endif /* MDBX_ENABLE_REFUND */ - VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - txn->mt_env->me_psize - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), - txn->mt_env->me_psize - PAGEHDRSZ); - return MDBX_SUCCESS; - } + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + txn->mt_env->me_psize - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), + txn->mt_env->me_psize - PAGEHDRSZ); + return MDBX_SUCCESS; + } #if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__) - if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif - { - /* Страница могла быть изменена в одной из родительских транзакций, - * в том числе, позже выгружена и затем снова загружена и изменена. - * В обоих случаях её нельзя затирать на диске и помечать недоступной - * в asan и/или valgrind */ - for (MDBX_txn *parent = txn->mt_parent; - parent && (parent->mt_flags & MDBX_TXN_SPILLS); - parent = parent->mt_parent) { - if (intersect_spilled(parent, pgno, npages)) - goto skip_invalidate; - if (dpl_intersect(parent, pgno, npages)) - goto skip_invalidate; - } + { + /* Страница могла быть изменена в одной из родительских транзакций, + * в том числе, позже выгружена и затем снова загружена и изменена. 
+ * В обоих случаях её нельзя затирать на диске и помечать недоступной + * в asan и/или valgrind */ + for (MDBX_txn *parent = txn->mt_parent; + parent && (parent->mt_flags & MDBX_TXN_SPILLS); + parent = parent->mt_parent) { + if (intersect_spilled(parent, pgno, npages)) + goto skip_invalidate; + if (dpl_intersect(parent, pgno, npages)) + goto skip_invalidate; + } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - if (MDBX_DEBUG != 0 || - unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif - kill_page(txn, mp, pgno, npages); - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { - VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), - pgno2bytes(txn->mt_env, npages) - - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION( - page_data(pgno2page(txn->mt_env, pgno)), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - } + kill_page(txn, mp, pgno, npages); + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)), + pgno2bytes(txn->mt_env, npages) - + PAGEHDRSZ); } - skip_invalidate:; } + skip_invalidate: /* wash dirty page */ page_wash(txn, di, mp, npages); @@ -4990,6 +4991,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, dpl_clear(txn->tw.dirtylist); txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = mp_next(lp)) { + tASSERT(txn, lp->mp_flags == P_LOOSE); rc = dpl_append(txn, lp->mp_pgno, lp, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -9188,8 +9190,6 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.loose_count) { do { MDBX_page *lp = parent->tw.loose_pages; - const size_t di = dpl_exist(parent, lp->mp_pgno); - tASSERT(parent, 
di && parent->tw.dirtylist->items[di].ptr == lp); tASSERT(parent, lp->mp_flags == P_LOOSE); rc = pnl_insert_range(&parent->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) @@ -9198,7 +9198,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); parent->tw.loose_pages = mp_next(lp); /* Remove from dirty list */ - page_wash(parent, di, lp, 1); + page_wash(parent, dpl_exist(parent, lp->mp_pgno), lp, 1); } while (parent->tw.loose_pages); parent->tw.loose_count = 0; #if MDBX_ENABLE_REFUND @@ -19339,8 +19339,6 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } } - /* If not operating on GC, allow this page to be reused - * in this txn. Otherwise just add to free list. */ rc = page_retire(csrc, (MDBX_page *)psrc); if (unlikely(rc)) return rc; From 48bd3fc4c8e9eb221d8651f6a9638bd7ec88be9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 28 Dec 2022 16:25:04 +0300 Subject: [PATCH 301/364] =?UTF-8?q?mdbx:=20=D1=83=D0=BF=D1=80=D0=BE=D1=89?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`default=5Fprefault=5Fwrite()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/core.c b/src/core.c index 62d2cc65..2758e524 100644 --- a/src/core.c +++ b/src/core.c @@ -5963,11 +5963,8 @@ __cold static unsigned default_rp_augment_limit(const MDBX_env *env) { } __cold static bool default_prefault_write(const MDBX_env *env) { - if (env->me_incore || - (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) != MDBX_WRITEMAP) - return false; - - return !MDBX_MMAP_INCOHERENT_FILE_WRITE; + return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore && + (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP; } static void adjust_defaults(MDBX_env 
*env) { From adf433a1bc4692211917c375f36702b3198b7025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 28 Dec 2022 21:42:57 +0300 Subject: [PATCH 302/364] =?UTF-8?q?mdbx-make:=20=D0=B4=D0=BE=D1=80=D0=B0?= =?UTF-8?q?=D0=B1=D0=BE=D1=82=D0=BA=D0=B0=20=D0=BC=D0=B0=D0=BA=D1=80=D0=BE?= =?UTF-8?q?=20=D0=B4=D0=BB=D1=8F=20bench-=D1=86=D0=B5=D0=BB=D0=B5=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GNUmakefile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 85147f74..e46a36bf 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -813,15 +813,20 @@ define bench-rule bench-$(1)_$(2).txt: $(3) $(IOARENA) $(lastword $(MAKEFILE_LIST)) @echo ' RUNNING ioarena for $1/$2...' $(QUIET)(export LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}"; \ - ldd $(IOARENA) && \ + ldd $(IOARENA) | grep -i $(1) && \ + $(IOARENA) -D $(1) -B batch -m $(BENCH_CRUD_MODE) -n $(2) \ + | tee $$@ | grep throughput | sed 's/throughput/batch×N/' && \ $(IOARENA) -D $(1) -B crud -m $(BENCH_CRUD_MODE) -n $(2) \ - | tee $$@ | grep throughput && \ + | tee -a $$@ | grep throughput | sed 's/throughput/ crud/' && \ $(IOARENA) -D $(1) -B iterate,get,iterate,get,iterate -m $(BENCH_CRUD_MODE) -r 4 -n $(2) \ - | tee -a $$@ | grep throughput \ - ) || mv -f $$@ $$@.error + | tee -a $$@ | grep throughput | sed '0,/throughput/{s/throughput/iterate/};s/throughput/ get/' && \ + $(IOARENA) -D $(1) -B delete -m $(BENCH_CRUD_MODE) -n $(2) \ + | tee -a $$@ | grep throughput | sed 's/throughput/ delete/' && \ + true) || mv -f $$@ $$@.error endef + $(eval $(call bench-rule,mdbx,$(NN),libmdbx.$(SO_SUFFIX))) $(eval $(call bench-rule,sophia,$(NN))) From f0c43fb24aa2b679d54cdf9db8274cc6589e2081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 29 Dec 2022 00:38:50 +0300 Subject: [PATCH 303/364] =?UTF-8?q?mdbx:=20=D0=B1=D0=B5=D0=B7=20=D0=BD?= =?UTF-8?q?=D0=B5=D0=BE=D0=B1=D1=85=D0=BE=D0=B4=D0=B8=D0=BC=D0=BE=D1=81?= =?UTF-8?q?=D1=82=D0=B8=20=D0=BD=D0=B5=20=D0=BE=D0=B1=D1=8A=D0=B5=D0=B4?= =?UTF-8?q?=D0=B8=D0=BD=D1=8F=D0=B5=D0=BC=20=D0=BD=D0=B5-=D0=B3=D1=80?= =?UTF-8?q?=D1=8F=D0=B7=D0=BD=D1=8B=D0=B5=20=D1=81=D1=82=D1=80=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=86=D1=8B=20=D0=B2=20=D0=B4=D0=B5=D1=80=D0=B5=D0=B2?= =?UTF-8?q?=D0=B5.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index 2758e524..bc5021a6 100644 --- a/src/core.c +++ b/src/core.c @@ -19488,6 +19488,7 @@ static int rebalance(MDBX_cursor *mc) { (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), room); + cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, tp)); if (unlikely(numkeys < minkeys)) { DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold", @@ -19576,8 +19577,9 @@ static int rebalance(MDBX_cursor *mc) { IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); rc = page_retire(mc, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (likely(rc == MDBX_SUCCESS)) + rc = page_touch(mc); + return rc; } else { DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", mp->mp_pgno, mp->mp_flags); @@ -19627,8 +19629,10 @@ static int rebalance(MDBX_cursor *mc) { const size_t right_room = right ? page_room(right) : 0; const size_t left_nkeys = left ? page_numkeys(left) : 0; const size_t right_nkeys = right ? 
page_numkeys(right) : 0; + bool involve = false; retry: - if (left_room > room_threshold && left_room >= right_room) { + if (left_room > room_threshold && left_room >= right_room && + (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try merge with left */ cASSERT(mc, left_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = left; @@ -19646,7 +19650,8 @@ retry: return rc; } } - if (right_room > room_threshold) { + if (right_room > room_threshold && + (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { /* try merge with right */ cASSERT(mc, right_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = right; @@ -19662,7 +19667,8 @@ retry: } if (left_nkeys > minkeys && - (right_nkeys <= left_nkeys || right_room >= left_room)) { + (right_nkeys <= left_nkeys || right_room >= left_room) && + (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try move from left */ mn.mc_pg[mn.mc_top] = left; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); @@ -19675,7 +19681,7 @@ retry: return rc; } } - if (right_nkeys > minkeys) { + if (right_nkeys > minkeys && (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { /* try move from right */ mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); @@ -19696,6 +19702,10 @@ retry: return MDBX_SUCCESS; } + if (likely(!involve)) { + involve = true; + goto retry; + } if (likely(room_threshold > 0)) { room_threshold = 0; goto retry; From 37867a0b84f1d75042b4f4f943093d980d183001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 29 Dec 2022 14:10:39 +0300 Subject: [PATCH 304/364] =?UTF-8?q?mdbx:=20=D0=BD=D0=B5=20=D0=BE=D0=B1?= =?UTF-8?q?=D1=85=D0=BE=D0=B4=D0=B8=D0=BC=20=D0=BF=D1=80=D0=BE=D0=B2=D0=B5?= =?UTF-8?q?=D1=80=D0=BA=D1=83=20=D0=BA=D0=BE=D0=B3=D0=B5=D1=80=D0=B5=D0=BD?= =?UTF-8?q?=D1=82=D0=BD=D0=BE=D1=81=D1=82=D0=B8=20=D0=B2=20=D1=80=D0=B5?= =?UTF-8?q?=D0=B6=D0=B8=D0=BC=D0=B5=20=D0=B2=D0=BE=D1=81=D1=81=D1=82=D0=B0?= 
=?UTF-8?q?=D0=BD=D0=BE=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D1=8F.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core.c b/src/core.c index bc5021a6..f3a1d765 100644 --- a/src/core.c +++ b/src/core.c @@ -8782,9 +8782,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); txn->mt_canary = head.ptr_v->mm_canary; - if (unlikely(env->me_stuck_meta >= 0)) - break; - if (unlikely(meta_should_retry(env, &troika) || + if (likely(env->me_stuck_meta < 0) && + unlikely(meta_should_retry(env, &troika) || head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader, mo_AcquireRelease))) { if (unlikely(++loop > 42)) { From 2322138a8e75925386f58fd00625371db8cde569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 29 Dec 2022 14:10:53 +0300 Subject: [PATCH 305/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=80=D1=80=D0=B5?= =?UTF-8?q?=D0=BA=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=BA=D0=B0=20=D1=81=D0=BE?= =?UTF-8?q?=D0=BE=D0=B1=D1=89=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=BE=D0=B1=20?= =?UTF-8?q?=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B5.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index f3a1d765..fef9b487 100644 --- a/src/core.c +++ b/src/core.c @@ -13409,8 +13409,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, pv2pages(header.mm_geo.grow_pv) * pagesize, pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - ERROR("%s: err %d", "could not apply preconfigured geometry from db", - err); + ERROR("%s: err %d", "could not apply geometry from db", err); return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; } } else if (env->me_dbgeo.now) { From f53dc70038cd61e66782301f4dea247c46f3da43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 30 Dec 2022 01:51:08 +0300 Subject: [PATCH 306/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`eq=5Ffast()`=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=81=D1=80=D0=B0=D0=B2=D0=BD=D0=B5=D0=BD=D0=B8=D0=B9?= =?UTF-8?q?=20=D0=BD=D0=B0=20(=D0=BD=D0=B5)=D1=80=D0=B0=D0=B2=D0=B5=D0=BD?= =?UTF-8?q?=D1=81=D1=82=D0=B2=D0=BE.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Цель в том, чтобы уменьшить кол-во условных и безусловных переходов при сравнениях равно/неравно, в том числе избегать вызовов задаваемых кастомных компаратаров и memcmp() для коротких ключей/значений. --- src/core.c | 92 +++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/src/core.c b/src/core.c index fef9b487..3116448d 100644 --- a/src/core.c +++ b/src/core.c @@ -15236,12 +15236,31 @@ __hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { : memcmp(a->iov_base, b->iov_base, a->iov_len); } -static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, - const MDBX_val *b) { - /* checking for the use of a known good comparator - * or/otherwise for a full byte-to-byte match */ - return cmp == cmp_lenfast || cmp == cmp_lexical || cmp == cmp_reverse || - cmp == cmp_int_unaligned || cmp_lenfast(a, b) == 0; +__hot static bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, + size_t l) { + if (likely(l > 3)) { + if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9)) + return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) | + (unaligned_peek_u32(1, a + l - 4) - + unaligned_peek_u32(1, b + l - 4))) == 0; + if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17)) + return 
((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) | + (unaligned_peek_u64(1, a + l - 8) - + unaligned_peek_u64(1, b + l - 8))) == 0; + return memcmp(a, b, l) == 0; + } + if (likely(l)) { + STATIC_ASSERT(sizeof(int) > 2); + const unsigned a3 = a[0] << 16 | a[l >> 1] << 8 | a[l - 1]; + const unsigned b3 = b[0] << 16 | b[l >> 1] << 8 | b[l - 1]; + return a3 == b3; + } + return true; +} + +static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { + return unlikely(a->iov_len == b->iov_len) && + eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); } /* Search for key within a page, using binary search. @@ -17099,25 +17118,21 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, flags -= MDBX_ALLDUPS; rc = MDBX_NOTFOUND; exact = false; - } else /* checking for early exit without dirtying pages */ - if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE)) && - unlikely(mc->mc_dbx->md_dcmp(data, &olddata) == 0)) { - if (!mc->mc_xcursor) - /* the same data, nothing to update */ - return MDBX_SUCCESS; - if (flags & MDBX_NODUPDATA) - return MDBX_KEYEXIST; - if (flags & MDBX_APPENDDUP) - return MDBX_EKEYMISMATCH; - if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) - /* data is match exactly byte-to-byte, nothing to update */ - return MDBX_SUCCESS; - else { - /* The data has differences, but the user-provided comparator - * considers them equal. So continue update since called without. - * Continue to update since was called without MDBX_NODUPDATA. 
*/ + } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { + /* checking for early exit without dirtying pages */ + if (unlikely(eq_fast(data, &olddata))) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); + if (mc->mc_xcursor) { + if (flags & MDBX_NODUPDATA) + return MDBX_KEYEXIST; + if (flags & MDBX_APPENDDUP) + return MDBX_EKEYMISMATCH; } + /* the same data, nothing to update */ + return MDBX_SUCCESS; } + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) != 0); + } } } else if (unlikely(rc != MDBX_NOTFOUND)) return rc; @@ -17322,28 +17337,21 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, /* Was a single item before, must convert now */ if (!(node_flags(node) & F_DUPDATA)) { - /* does data match? */ - const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); - if ((flags & MDBX_APPENDDUP) && unlikely(cmp <= 0)) - return MDBX_EKEYMISMATCH; - if (cmp == 0) { + if (flags & MDBX_APPENDDUP) { + const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); + cASSERT(mc, cmp != 0 || eq_fast(data, &olddata)); + if (unlikely(cmp <= 0)) + return MDBX_EKEYMISMATCH; + } else if (eq_fast(data, &olddata)) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); if (flags & MDBX_NODUPDATA) return MDBX_KEYEXIST; - if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) { - /* data is match exactly byte-to-byte, nothing to update */ - if (unlikely(flags & MDBX_MULTIPLE)) { - rc = MDBX_SUCCESS; - goto continue_multiple; - } - return MDBX_SUCCESS; - } else { - /* The data has differences, but the user-provided comparator - * considers them equal. So continue update since called without. - * Continue to update since was called without MDBX_NODUPDATA. 
*/ - } - cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); - goto current; + /* data is match exactly byte-to-byte, nothing to update */ + rc = MDBX_SUCCESS; + if (likely((flags & MDBX_MULTIPLE) == 0)) + return rc; + goto continue_multiple; } /* Just overwrite the current item */ From be050379066f0b152afd78aeb105a5ea3e05184b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 30 Dec 2022 17:18:52 +0300 Subject: [PATCH 307/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BC?= =?UTF-8?q?=D0=B5=D1=89=D0=B5=D0=BD=D0=B8=D0=B5=20debug/assert-=D0=BC?= =?UTF-8?q?=D0=B0=D0=BA=D1=80=D0=BE=D1=81=D0=BE=D0=B2=20=D0=BF=D0=B5=D1=80?= =?UTF-8?q?=D0=B5=D0=B4=20=D0=B0=D1=82=D0=BE=D0=BC=D0=B8=D0=BA=D0=B0=D0=BC?= =?UTF-8?q?=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/internals.h | 269 ++++++++++++++++++++++++------------------------ 1 file changed, 136 insertions(+), 133 deletions(-) diff --git a/src/internals.h b/src/internals.h index 684628e5..0e346ab4 100644 --- a/src/internals.h +++ b/src/internals.h @@ -227,6 +227,142 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & runtime_flags) + osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) 
+ MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -1359,139 +1495,6 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; - -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC 
void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) - MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) -#else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. 
*/ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) \ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ From 5317e516d25fffce8cb6179897141366aaf5970a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 30 Dec 2022 18:21:42 +0300 Subject: [PATCH 308/364] =?UTF-8?q?mdbx:=20=D0=BC=D0=B8=D0=BA=D1=80=D0=BE-?= =?UTF-8?q?=D0=BE=D0=BF=D1=82=D0=B8=D0=BC=D0=B8=D0=B7=D0=B0=D1=86=D0=B8?= =?UTF-8?q?=D1=8F=20`cmp=5Fint()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 225 
++++++++++++++++++++++++++--------------------------- 1 file changed, 110 insertions(+), 115 deletions(-) diff --git a/src/core.c b/src/core.c index 3116448d..049c17ce 100644 --- a/src/core.c +++ b/src/core.c @@ -3381,9 +3381,6 @@ static int __must_check_result setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, const unsigned pagesize); -static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2, - cmp_int_unaligned, cmp_lenfast; - static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags); static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags); @@ -11832,6 +11829,116 @@ fail: goto provide_latency; } +static __always_inline int cmp_int_inline(const size_t expected_alignment, + const MDBX_val *a, + const MDBX_val *b) { + if (likely(a->iov_len == b->iov_len)) { + if (sizeof(size_t) > 7 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + if (likely(a->iov_len == 4)) + return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base), + unaligned_peek_u32(expected_alignment, b->iov_base)); + if (sizeof(size_t) < 8 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + } + ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP", + a->iov_base, a->iov_len, b->iov_base, b->iov_len); + return 0; +} + +__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(1, a, b); +} + +/* Compare two items pointing at 2-byte aligned unsigned int's. 
*/ +#if MDBX_UNALIGNED_OK < 2 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(2, a, b); +} +#else +#define cmp_int_align2 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ + +/* Compare two items pointing at aligned unsigned int's. */ +#if MDBX_UNALIGNED_OK < 4 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(4, a, b); +} +#else +#define cmp_int_align4 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ + +/* Compare two items lexically */ +__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { + if (a->iov_len == b->iov_len) + return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; + + const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; + const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0; + return likely(diff_data) ? diff_data : diff_len; +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +tail3le(const uint8_t *p, size_t l) { + STATIC_ASSERT(sizeof(unsigned) > 2); + // 1: 0 0 0 + // 2: 0 1 1 + // 3: 0 1 2 + return p[0] | p[l >> 1] << 8 | p[l - 1] << 16; +} + +/* Compare two items in reverse byte order */ +__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { + const size_t shortest = (a->iov_len < b->iov_len) ? 
a->iov_len : b->iov_len; + if (likely(shortest)) { + const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len); + const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len); + const uint8_t *const end = pa - shortest; + do { + int diff = *--pa - *--pb; + if (likely(diff)) + return diff; + } while (pa != end); + } + return CMP2INT(a->iov_len, b->iov_len); +} + +/* Fast non-lexically comparator */ +__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { + int diff = CMP2INT(a->iov_len, b->iov_len); + return (likely(diff) || a->iov_len == 0) + ? diff + : memcmp(a->iov_base, b->iov_base, a->iov_len); +} + +__hot static bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, + size_t l) { + if (likely(l > 3)) { + if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9)) + return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) | + (unaligned_peek_u32(1, a + l - 4) - + unaligned_peek_u32(1, b + l - 4))) == 0; + if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17)) + return ((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) | + (unaligned_peek_u64(1, a + l - 8) - + unaligned_peek_u64(1, b + l - 8))) == 0; + return memcmp(a, b, l) == 0; + } + if (likely(l)) + return tail3le(a, l) == tail3le(b, l); + return true; +} + +static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { + return unlikely(a->iov_len == b->iov_len) && + eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); +} + static int validate_meta(MDBX_env *env, MDBX_meta *const meta, const MDBX_page *const page, const unsigned meta_number, unsigned *guess_pagesize) { @@ -15151,118 +15258,6 @@ __cold int mdbx_env_close(MDBX_env *env) { } #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ -/* Compare two items pointing at aligned unsigned int's. 
*/ -__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { - eASSERT(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(4, a->iov_base), - unaligned_peek_u32(4, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(4, a->iov_base), - unaligned_peek_u64(4, b->iov_base)); - default: - mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); - return 0; - } -} - -/* Compare two items pointing at 2-byte aligned unsigned int's. */ -__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { - eASSERT(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(2, a->iov_base), - unaligned_peek_u32(2, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(2, a->iov_base), - unaligned_peek_u64(2, b->iov_base)); - default: - mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); - return 0; - } -} - -/* Compare two items pointing at unsigned values with unknown alignment. - * - * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ -__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { - eASSERT(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(1, a->iov_base), - unaligned_peek_u32(1, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(1, a->iov_base), - unaligned_peek_u64(1, b->iov_base)); - default: - mdbx_panic("invalid size %zu for INTEGERKEY/INTEGERDUP", a->iov_len); - return 0; - } -} - -/* Compare two items lexically */ -__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { - if (a->iov_len == b->iov_len) - return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; - - const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; - const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - int diff_data = shortest ? 
memcmp(a->iov_base, b->iov_base, shortest) : 0; - return likely(diff_data) ? diff_data : diff_len; -} - -/* Compare two items in reverse byte order */ -__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { - const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - if (likely(shortest)) { - const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len); - const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len); - const uint8_t *const end = pa - shortest; - do { - int diff = *--pa - *--pb; - if (likely(diff)) - return diff; - } while (pa != end); - } - return CMP2INT(a->iov_len, b->iov_len); -} - -/* Fast non-lexically comparator */ -__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { - int diff = CMP2INT(a->iov_len, b->iov_len); - return likely(diff) || a->iov_len == 0 - ? diff - : memcmp(a->iov_base, b->iov_base, a->iov_len); -} - -__hot static bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, - size_t l) { - if (likely(l > 3)) { - if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9)) - return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) | - (unaligned_peek_u32(1, a + l - 4) - - unaligned_peek_u32(1, b + l - 4))) == 0; - if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17)) - return ((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) | - (unaligned_peek_u64(1, a + l - 8) - - unaligned_peek_u64(1, b + l - 8))) == 0; - return memcmp(a, b, l) == 0; - } - if (likely(l)) { - STATIC_ASSERT(sizeof(int) > 2); - const unsigned a3 = a[0] << 16 | a[l >> 1] << 8 | a[l - 1]; - const unsigned b3 = b[0] << 16 | b[l >> 1] << 8 | b[l - 1]; - return a3 == b3; - } - return true; -} - -static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { - return unlikely(a->iov_len == b->iov_len) && - eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); -} - /* Search for key within a page, using binary search. * Returns the smallest entry larger or equal to the key. 
* Updates the cursor index with the index of the found entry. From bcddeaba9fb150c71b568075546442cabd1ab3cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 31 Dec 2022 00:55:46 +0300 Subject: [PATCH 309/364] =?UTF-8?q?mdbx:=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`CMP2INT()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Решил вернуться к старому варианту. Вроде-бы все актуальные компиляторы ведут себя с ним прилично (не хуже), а некоторые лучше. --- src/internals.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/internals.h b/src/internals.h index 0e346ab4..862a6697 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1681,13 +1681,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? 
-1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { From 8519fde741ee93119ba7e6085f5e53f68fd64e0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 31 Dec 2022 01:30:52 +0300 Subject: [PATCH 310/364] =?UTF-8?q?mdbx:=20=D0=BC=D0=B8=D0=BA=D1=80=D0=BE-?= =?UTF-8?q?=D0=BE=D0=BF=D1=82=D0=B8=D0=BC=D0=B8=D0=B7=D0=B0=D1=86=D0=B8?= =?UTF-8?q?=D1=8F=20`cmp=5Freverse()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-------- src/osal.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/src/core.c b/src/core.c index 049c17ce..38d80830 100644 --- a/src/core.c +++ b/src/core.c @@ -11894,16 +11894,54 @@ tail3le(const uint8_t *p, size_t l) { /* Compare two items in reverse byte order */ __hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { - const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - if (likely(shortest)) { + size_t left = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + if (likely(left)) { const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len); const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len); - const uint8_t *const end = pa - shortest; - do { - int diff = *--pa - *--pb; - if (likely(diff)) - return diff; - } while (pa != end); + while (left >= sizeof(size_t)) { + pa -= sizeof(size_t); + pb -= sizeof(size_t); + left -= sizeof(size_t); + STATIC_ASSERT(sizeof(size_t) == 4 || sizeof(size_t) == 8); + if (sizeof(size_t) == 4) { + uint32_t xa = unaligned_peek_u32(1, pa); + uint32_t xb = unaligned_peek_u32(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap32(xa); + xb = osal_bswap32(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? 
-1 : 1; + } else { + uint64_t xa = unaligned_peek_u64(1, pa); + uint64_t xb = unaligned_peek_u64(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap64(xa); + xb = osal_bswap64(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + } + if (sizeof(size_t) == 8 && left >= 4) { + pa -= 4; + pb -= 4; + left -= 4; + uint32_t xa = unaligned_peek_u32(1, pa); + uint32_t xb = unaligned_peek_u32(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap32(xa); + xb = osal_bswap32(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + if (left) { + unsigned xa = tail3le(pa - left, left); + unsigned xb = tail3le(pb - left, left); + if (xa != xb) + return (xa < xb) ? -1 : 1; + } } return CMP2INT(a->iov_len, b->iov_len); } diff --git a/src/osal.h b/src/osal.h index 53b80e58..d7809ae4 100644 --- a/src/osal.h +++ b/src/osal.h @@ -890,6 +890,46 @@ MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if 
__GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ From f0c2927fc71df44b6923031c32fdc11e9a9ba000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 1 Jan 2023 01:26:55 +0300 Subject: [PATCH 311/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BD?= =?UTF-8?q?=D0=BE=D1=81=20LRU-=D0=BE=D1=82=D0=BC=D0=B5=D1=82=D0=BE=D0=BA?= =?UTF-8?q?=20=D0=B2=20=D1=82=D0=B5=D0=BD=D0=B5=D0=B2=D1=8B=D0=B5=20=D1=81?= =?UTF-8?q?=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86=D1=8B=20=D0=BF=D0=BE=20?= =?UTF-8?q?=D0=BE=D1=82=D1=80=D0=B8=D1=86=D0=B0=D1=82=D0=B5=D0=BB=D1=8C?= =?UTF-8?q?=D0=BD=D0=BE=D0=BC=D1=83=20=D1=81=D0=BC=D0=B5=D1=89=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D1=8E.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Это позволяет избавиться от повторного поиска в "грязном" списке страниц, уже находящихся в стеке курсора, для обновления LRU-отметок.
--- src/core.c | 96 +++++++++++++++++++++++++++---------------------- src/internals.h | 5 +-- 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/src/core.c b/src/core.c index 38d80830..32ce7847 100644 --- a/src/core.c +++ b/src/core.c @@ -2768,7 +2768,7 @@ static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { dl->length = len; dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; dl->items[len + 1].pgno = P_INVALID; - dl->items[len + 1].mlru = 0; + dl->items[len + 1].npages = 1; return len; } @@ -2783,7 +2783,7 @@ static __always_inline void dpl_clear(MDBX_dpl *dl) { dl->pages_including_loose = 0; dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; dl->items[0].pgno = 0; - dl->items[0].mlru = 0; + dl->items[0].npages = 1; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } @@ -2954,9 +2954,7 @@ __hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) { MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned dpl_npages(const MDBX_dpl *dl, size_t i) { assert(0 <= (intptr_t)i && i <= dl->length); - unsigned n = 1; - if (unlikely(dl->items[i].mlru & MDBX_dp_multi_mask)) - n = dl->items[i].ptr->mp_pages; + unsigned n = dl->items[i].npages; assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? 
dl->items[i].ptr->mp_pages : 1)); return n; } @@ -3047,13 +3045,14 @@ static void dpl_remove(const MDBX_txn *txn, size_t i) { static __noinline void txn_lru_reduce(MDBX_txn *txn) { NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1); + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); do { txn->tw.dirtylru >>= 1; MDBX_dpl *dl = txn->tw.dirtylist; for (size_t i = 1; i <= dl->length; ++i) { - uint32_t mlru = dl->items[i].mlru; - mlru = (mlru & MDBX_dp_multi_mask) + ((mlru >> 1) & MDBX_dp_lru_mask); - dl->items[i].mlru = mlru; + size_t *const ptr = + ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr >>= 1; } txn = txn->mt_parent; } while (txn); @@ -3061,18 +3060,19 @@ static __noinline void txn_lru_reduce(MDBX_txn *txn) { MDBX_NOTHROW_PURE_FUNCTION static __inline uint32_t dpl_age(const MDBX_txn *txn, size_t i) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); const MDBX_dpl *dl = txn->tw.dirtylist; assert((intptr_t)i > 0 && i <= dl->length); - return (txn->tw.dirtylru >> 1) - (dl->items[i].mlru >> 1); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + return txn->tw.dirtylru - (uint32_t)*ptr; } static __inline uint32_t txn_lru_turn(MDBX_txn *txn) { - txn->tw.dirtylru += 2; - if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3)) + txn->tw.dirtylru += 1; + if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) && + (txn->mt_flags & MDBX_WRITEMAP) == 0) txn_lru_reduce(txn); - return txn->tw.dirtylru & MDBX_dp_lru_mask; + return txn->tw.dirtylru; } static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, @@ -3081,7 +3081,12 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, size_t npages) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || 
MDBX_AVOID_MSYNC); - const MDBX_dp dp = {page, pgno, txn_lru_turn(txn) + (npages > 1)}; + const MDBX_dp dp = {page, pgno, (pgno_t)npages}; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + } + MDBX_dpl *dl = txn->tw.dirtylist; tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); tASSERT(txn, dl->items[0].pgno == 0 && @@ -3166,7 +3171,7 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, #else i[1].ptr = i->ptr; i[1].pgno = i->pgno; - i[1].mlru = i->mlru; + i[1].npages = i->npages; #endif --i; } @@ -3784,12 +3789,13 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { env->me_dp_reserve_len -= 1; } else { size = pgno2bytes(env, num); - np = osal_malloc(size); - if (unlikely(!np)) { + void *const ptr = osal_malloc(size + sizeof(size_t)); + if (unlikely(!ptr)) { txn->mt_flags |= MDBX_TXN_ERROR; - return np; + return nullptr; } - VALGRIND_MEMPOOL_ALLOC(env, np, size); + VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t)); + np = ptr_disp(ptr, sizeof(size_t)); } if ((env->me_flags & MDBX_NOMEMINIT) == 0) { @@ -3826,8 +3832,9 @@ static void dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) { env->me_dp_reserve_len += 1; } else { /* large pages just get freed directly */ - VALGRIND_MEMPOOL_FREE(env, dp); - osal_free(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + VALGRIND_MEMPOOL_FREE(env, ptr); + osal_free(ptr); } } @@ -3879,10 +3886,12 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; - const uint32_t age = dpl_age(txn, i); - tASSERT(txn, age < UINT32_MAX / 3); - if (unlikely(age > UINT32_MAX / 3)) - return false; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + const uint32_t age = dpl_age(txn, i); + tASSERT(txn, age < UINT32_MAX / 3); + if (unlikely(age > UINT32_MAX / 3)) + return false; + } tASSERT(txn, dp->mp_flags == 
P_LOOSE || IS_MODIFIABLE(txn, dp)); if (dp->mp_flags == P_LOOSE) { @@ -4766,8 +4775,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. */ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); size_t keep = 0; while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { tASSERT(txn, mc->mc_top == mc->mc_snum - 1); @@ -4780,9 +4788,9 @@ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { size_t const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && /* не считаем дважды */ dpl_age(txn, n)) { - txn->tw.dirtylist->items[n].mlru = - (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + - (txn->tw.dirtylru & MDBX_dp_lru_mask); + size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, + -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; tASSERT(txn, dpl_age(txn, n) == 0); ++keep; } @@ -4801,8 +4809,7 @@ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { } static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); txn_lru_turn(txn); size_t keep = m0 ? 
cursor_keep(txn, m0) : 0; for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) @@ -5085,10 +5092,10 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); for (size_t i = 1; i <= dl->length; ++i) { const unsigned prio = spill_prio(txn, i, reciprocal); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); TRACE("page %" PRIaPGNO - ", lru %u, is_multi %c, npages %u, age %u of %u, prio %u", - dl->items[i].pgno, dl->items[i].mlru & MDBX_dp_lru_mask, - (dl->items[i].mlru & MDBX_dp_multi_mask) ? 'Y' : 'N', + ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u", + dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 'Y' : 'N', dpl_npages(dl, i), dpl_age(txn, i), age_max, prio); if (prio < 256) { radix_entries[prio] += 1; @@ -7858,9 +7865,11 @@ __hot static int page_touch(MDBX_cursor *mc) { tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && txn->tw.dirtylist->items[n].ptr == mp); - txn->tw.dirtylist->items[n].mlru = - (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + - (txn->tw.dirtylru & MDBX_dp_lru_mask); + if (!MDBX_AVOID_MSYNC || (txn->mt_flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = + ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + } return MDBX_SUCCESS; } if (IS_SUBP(mp)) { @@ -15280,7 +15289,9 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); env->me_dp_reserve = mp_next(dp); - osal_free(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + VALGRIND_MEMPOOL_FREE(env, ptr); + osal_free(ptr); } VALGRIND_DESTROY_MEMPOOL(env); ENSURE(env, env->me_lcklist_next == nullptr); @@ -24653,8 +24664,9 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, 
MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); env->me_dp_reserve = mp_next(dp); - VALGRIND_MEMPOOL_FREE(env, dp); - osal_free(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + VALGRIND_MEMPOOL_FREE(env, ptr); + osal_free(ptr); env->me_dp_reserve_len -= 1; } } diff --git a/src/internals.h b/src/internals.h index 862a6697..25a788b9 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1059,10 +1059,7 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - uint32_t mlru; -#define MDBX_dp_multi_mask 1 -#define MDBX_dp_lru_mask UINT32_C(0xffffFFFe) + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ From ab55016599a502faa9362583b14986bfb6c47370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 1 Jan 2023 15:39:01 +0300 Subject: [PATCH 312/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BB=D0=BE=D0=B6=D0=BD=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D1=81=D1=80=D0=B0=D0=B1=D0=B0=D1=82=D1=8B=D0=B2?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D1=8F=20assert=20=D0=B2=D0=BD=D1=83=D1=82?= =?UTF-8?q?=D1=80=D0=B8=20`dpl=5Freserve()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/core.c b/src/core.c index 32ce7847..ead34221 100644 --- a/src/core.c +++ b/src/core.c @@ -2819,6 +2819,11 @@ static int dpl_alloc(MDBX_txn *txn) { const size_t wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) ? 
txn->mt_env->me_options.dp_initial : txn->mt_geo.upper; +#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG + if (txn->tw.dirtylist) + /* обнуляем чтобы не сработал ассерт внутри dpl_reserve() */ + txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0; +#endif /* asertions enabled */ if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || txn->tw.dirtylist->detent > wanna + wanna) && unlikely(!dpl_reserve(txn, wanna))) From 2c8d3e1e12f88b6834fbf3175137cdf419a84484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 1 Jan 2023 17:26:34 +0300 Subject: [PATCH 313/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BF=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D1=8F=20?= =?UTF-8?q?UBSAN.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index ead34221..b23b3700 100644 --- a/src/core.c +++ b/src/core.c @@ -4805,8 +4805,7 @@ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { tASSERT(txn, IS_LEAF(mp)); if (!mc->mc_xcursor || mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) break; - const MDBX_node *const node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_SUBDATA)) + if (!(node_flags(page_node(mp, mc->mc_ki[mc->mc_top])) & F_SUBDATA)) break; mc = &mc->mc_xcursor->mx_cursor; } From 24f2e878c102c9c2444c54f0dd66eba2a20cd246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 2 Jan 2023 15:47:48 +0300 Subject: [PATCH 314/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BD=D0=B5=D1=81=D1=83=D1=89?= 
=?UTF-8?q?=D0=B5=D1=81=D1=82=D0=B2=D0=B5=D0=BD=D0=BD=D1=8B=D1=85=20=D0=BF?= =?UTF-8?q?=D1=80=D0=B5=D0=B4=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B9=20Valgrind.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 29 +++++++++++++++++++---------- test/valgrind_suppress.txt | 8 ++++++++ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/core.c b/src/core.c index b23b3700..80ddb7c3 100644 --- a/src/core.c +++ b/src/core.c @@ -2221,11 +2221,11 @@ static __always_inline size_t pnl_size2bytes(size_t size) { #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + (MDBX_PGL_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) + - MDBX_PNL_GRANULATE + 2) * + MDBX_PNL_GRANULATE + 3) * sizeof(pgno_t) < SIZE_MAX / 4 * 3); size_t bytes = - ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2), + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 3), MDBX_PNL_GRANULATE * sizeof(pgno_t)) - MDBX_ASSUME_MALLOC_OVERHEAD; return bytes; @@ -2233,8 +2233,8 @@ static __always_inline size_t pnl_size2bytes(size_t size) { static __always_inline pgno_t pnl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(pgno_t); - assert(size > 2 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); - size -= 2; + assert(size > 3 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); + size -= 3; #if MDBX_PNL_PREALLOC_FOR_RADIXSORT size >>= 1; #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ @@ -2454,9 +2454,9 @@ __hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { const size_t src_len = MDBX_PNL_GETSIZE(src); const size_t dst_len = MDBX_PNL_GETSIZE(dst); size_t total = dst_len; + assert(MDBX_PNL_ALLOCLEN(dst) >= total); if (likely(src_len > 0)) { total += src_len; - assert(MDBX_PNL_ALLOCLEN(dst) >= total); if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 
21 : 12)) goto avoid_call_libc_for_short_cases; if (dst_len == 0 || @@ -2572,9 +2572,19 @@ __hot __noinline static size_t pnl_search_nochk(const MDBX_PNL pnl, static __inline size_t pnl_search(const MDBX_PNL pnl, pgno_t pgno, size_t limit) { assert(pnl_check_allocated(pnl, limit)); + if (MDBX_HAVE_CMOV) { + /* cmov-ускоренный бинарный поиск может читать (но не использовать) один + * элемент за концом данных, этот элемент в пределах выделенного участка + * памяти, но не инициализирован. */ + VALGRIND_MAKE_MEM_DEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } assert(pgno < limit); (void)limit; - return pnl_search_nochk(pnl, pgno); + size_t n = pnl_search_nochk(pnl, pgno); + if (MDBX_HAVE_CMOV) { + VALGRIND_MAKE_MEM_UNDEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } + return n; } static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { @@ -3788,7 +3798,8 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { if (likely(num == 1 && np)) { eASSERT(env, env->me_dp_reserve_len > 0); MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); - VALGRIND_MEMPOOL_ALLOC(env, np, size); + VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)), + size + sizeof(size_t)); VALGRIND_MAKE_MEM_DEFINED(&mp_next(np), sizeof(MDBX_page *)); env->me_dp_reserve = mp_next(np); env->me_dp_reserve_len -= 1; @@ -3832,7 +3843,7 @@ static void dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) { MDBX_ASAN_POISON_MEMORY_REGION(dp, env->me_psize); MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); mp_next(dp) = env->me_dp_reserve; - VALGRIND_MEMPOOL_FREE(env, dp); + VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t))); env->me_dp_reserve = dp; env->me_dp_reserve_len += 1; } else { @@ -15294,7 +15305,6 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); env->me_dp_reserve = mp_next(dp); void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); - 
VALGRIND_MEMPOOL_FREE(env, ptr); osal_free(ptr); } VALGRIND_DESTROY_MEMPOOL(env); @@ -24669,7 +24679,6 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); env->me_dp_reserve = mp_next(dp); void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); - VALGRIND_MEMPOOL_FREE(env, ptr); osal_free(ptr); env->me_dp_reserve_len -= 1; } diff --git a/test/valgrind_suppress.txt b/test/valgrind_suppress.txt index 3d0d1be4..5bc50077 100644 --- a/test/valgrind_suppress.txt +++ b/test/valgrind_suppress.txt @@ -38,6 +38,14 @@ ... fun:meta_sync* } +{ + msync-spill + Memcheck:Param + msync(start) + fun:msync + ... + fun:txn_spill* +} # memcmp() inside iov_write() as workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269 { From c6b73c8a24fae7b735beb6d7a72024ee8c1a15f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 3 Jan 2023 20:20:03 +0300 Subject: [PATCH 315/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`me=5Fmadv=5Fthreshold`=20?= =?UTF-8?q?=D0=B8=20=D1=80=D0=B5=D1=84=D0=B0=D0=BA=D1=82=D0=BE=D1=80=D0=B8?= =?UTF-8?q?=D0=BD=D0=B3/=D1=83=D0=BF=D1=80=D0=BE=D1=89=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D0=B5.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Для уменьшения затрат на MDBX_SHRINK_ALLOWED. --- src/core.c | 253 ++++++++++++++++++++++++------------------------ src/internals.h | 1 + 2 files changed, 130 insertions(+), 124 deletions(-) diff --git a/src/core.c b/src/core.c index 80ddb7c3..9caf8a91 100644 --- a/src/core.c +++ b/src/core.c @@ -5651,8 +5651,8 @@ static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { } /* Find largest mvcc-snapshot still referenced. 
*/ -__cold static pgno_t find_largest_snapshot(const MDBX_env *env, - pgno_t last_used_page) { +static pgno_t find_largest_snapshot(const MDBX_env *env, + pgno_t last_used_page) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck != NULL /* check for exclusive without-lck mode */)) { retry:; @@ -5981,7 +5981,7 @@ __cold static unsigned default_rp_augment_limit(const MDBX_env *env) { (augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL)); } -__cold static bool default_prefault_write(const MDBX_env *env) { +static bool default_prefault_write(const MDBX_env *env) { return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore && (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP; } @@ -5991,6 +5991,21 @@ static void adjust_defaults(MDBX_env *env) { env->me_options.rp_augment_limit = default_rp_augment_limit(env); if (!env->me_options.flags.non_auto.prefault_write) env->me_options.prefault_write = default_prefault_write(env); + + const size_t basis = env->me_dbgeo.now; + /* TODO: use options? */ + const unsigned factor = 9; + size_t threshold = (basis < (65536ul << factor)) + ? 65536 /* minimal threshold */ + : (basis > (MEGABYTE * 4 << factor)) + ? MEGABYTE * 4 /* maximal threshold */ + : basis >> factor; + threshold = (threshold < env->me_dbgeo.shrink || !env->me_dbgeo.shrink) + ? threshold + : env->me_dbgeo.shrink; + + env->me_madv_threshold = + bytes2pgno(env, bytes_align2os_bytes(env, threshold)); } __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, @@ -12435,20 +12450,6 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { return page_meta(page2); } -#if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64)) -static size_t madvise_threshold(const MDBX_env *env, - const size_t largest_bytes) { - /* TODO: use options */ - const unsigned factor = 9; - const size_t threshold = (largest_bytes < (65536ul << factor)) - ? 
65536 /* minimal threshold */ - : (largest_bytes > (MEGABYTE * 4 << factor)) - ? MEGABYTE * 4 /* maximal threshold */ - : largest_bytes >> factor; - return bytes_align2os_bytes(env, threshold); -} -#endif /* MDBX_ENABLE_MADVISE */ - static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, meta_troika_t *const troika) { eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); @@ -12482,127 +12483,131 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, pgno_t shrink = 0; if (flags & MDBX_SHRINK_ALLOWED) { - /* LY: check conditions to discard unused pages */ - const pgno_t largest_pgno = find_largest_snapshot( - env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) - ? head.ptr_c->mm_geo.next - : pending->mm_geo.next); - eASSERT(env, largest_pgno >= NUM_METAS); + const size_t prev_discarded_pgno = + atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); + if (prev_discarded_pgno < pending->mm_geo.next) + env->me_lck->mti_discarded_tail.weak = pending->mm_geo.next; + else if (prev_discarded_pgno >= + pending->mm_geo.next + env->me_madv_threshold) { + /* LY: check conditions to discard unused pages */ + const pgno_t largest_pgno = find_largest_snapshot( + env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) + ? 
head.ptr_c->mm_geo.next + : pending->mm_geo.next); + eASSERT(env, largest_pgno >= NUM_METAS); + #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - const pgno_t edge = env->me_poison_edge; - if (edge > largest_pgno) { - env->me_poison_edge = largest_pgno; - VALGRIND_MAKE_MEM_NOACCESS( - ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), - pgno2bytes(env, edge - largest_pgno)); - MDBX_ASAN_POISON_MEMORY_REGION( - ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), - pgno2bytes(env, edge - largest_pgno)); - } + const pgno_t edge = env->me_poison_edge; + if (edge > largest_pgno) { + env->me_poison_edge = largest_pgno; + VALGRIND_MAKE_MEM_NOACCESS( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + } #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + #if MDBX_ENABLE_MADVISE && \ (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) - const size_t largest_bytes = pgno2bytes(env, largest_pgno); - /* threshold to avoid unreasonable frequent madvise() calls */ - const size_t threshold = madvise_threshold(env, largest_bytes); - const size_t discard_edge_bytes = bytes_align2os_bytes( - env, ((MDBX_RDONLY & - (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak - : env->me_flags)) - ? 
largest_bytes - : largest_bytes + threshold)); - const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); - const pgno_t prev_discarded_pgno = - atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); - if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) { - NOTICE("shrink-MADV_%s %u..%u", "DONTNEED", largest_pgno, - prev_discarded_pgno); - atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, - mo_Relaxed); - const size_t prev_discarded_bytes = - ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); - ENSURE(env, prev_discarded_bytes > discard_edge_bytes); - munlock_after(env, discard_edge_pgno, - bytes_align2os_bytes(env, env->me_dxb_mmap.current)); - const uint32_t munlocks_before = - atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); + const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno); + if (prev_discarded_pgno >= discard_edge_pgno + env->me_madv_threshold) { + const size_t prev_discarded_bytes = + pgno_align2os_bytes(env, prev_discarded_pgno); + const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno); + /* из-за выравнивания prev_discarded_bytes и discard_edge_bytes + * могут быть равны */ + if (prev_discarded_bytes > discard_edge_bytes) { + NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno, + prev_discarded_pgno); + munlock_after(env, discard_edge_pgno, + bytes_align2os_bytes(env, env->me_dxb_mmap.current)); + const uint32_t munlocks_before = + atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); #if defined(MADV_DONTNEED) - int advise = MADV_DONTNEED; + int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ 0 /* MADV_FREE works for only anonymous vma at the moment */ - if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000) - advise = MADV_FREE; + if ((env->me_flags & MDBX_WRITEMAP) && + linux_kernel_version > 0x04050000) + advise = MADV_FREE; #endif /* MADV_FREE */ - int err = madvise(ptr_disp(env->me_map, 
discard_edge_bytes), - prev_discarded_bytes - discard_edge_bytes, advise) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + int err = madvise(ptr_disp(env->me_map, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, advise) + ? ignore_enosys(errno) + : MDBX_SUCCESS; #else - int err = ignore_enosys(posix_madvise( - ptr_disp(env->me_map, discard_edge_bytes), - prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); + int err = ignore_enosys(posix_madvise( + ptr_disp(env->me_map, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); #endif - if (unlikely(MDBX_IS_ERROR(err))) { - const uint32_t mlocks_after = - atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); - if (err == MDBX_EINVAL) { - const int severity = (mlocks_after - munlocks_before) - ? MDBX_LOG_NOTICE - : MDBX_LOG_WARN; - if (LOG_ENABLED(severity)) - debug_log(severity, __func__, __LINE__, - "%s-madvise: ignore EINVAL (%d) since some pages maybe " - "locked (%u/%u mlcnt-processes)", - "shrink", err, mlocks_after, munlocks_before); - } else { - ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", - "shrink", "DONTNEED", discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, mlocks_after, - munlocks_before, err); - return err; + if (unlikely(MDBX_IS_ERROR(err))) { + const uint32_t mlocks_after = + atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); + if (err == MDBX_EINVAL) { + const int severity = (mlocks_after - munlocks_before) + ? 
MDBX_LOG_NOTICE + : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log( + severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "shrink", err, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", + "shrink", "DONTNEED", discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, mlocks_after, + munlocks_before, err); + return err; + } + } else + env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; } - } else - env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; - } + } #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ - /* LY: check conditions to shrink datafile */ - const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; - pgno_t shrink_step = 0; - if (pending->mm_geo.shrink_pv && - pending->mm_geo.now - pending->mm_geo.next > - (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) { - if (pending->mm_geo.now > largest_pgno && - pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow_pv - ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv) - : shrink_step; - const pgno_t with_backlog_gap = largest_pgno + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - if (TROIKA_HAVE_STEADY(troika)) - /* force steady, but only if steady-checkpoint is present */ - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (unlikely(head.txnid == pending->unsafe_txnid)) { - const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); - NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, - pending->unsafe_txnid, txnid); - ENSURE(env, !env->me_txn0 || - (env->me_txn0->mt_owner != osal_thread_self() && - !env->me_txn)); - if (unlikely(txnid > MAX_TXNID)) { - rc = MDBX_TXN_FULL; - ERROR("txnid overflow, raise %d", rc); - goto fail; + /* LY: check conditions to shrink datafile */ + const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; + pgno_t shrink_step = 0; + if (pending->mm_geo.shrink_pv && + pending->mm_geo.now - pending->mm_geo.next > + (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + + backlog_gap) { + if (pending->mm_geo.now > largest_pgno && + pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { + const pgno_t aligner = + pending->mm_geo.grow_pv + ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv) + : shrink_step; + const pgno_t with_backlog_gap = largest_pgno + backlog_gap; + const pgno_t aligned = pgno_align2os_pgno( + env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t bottom = (aligned > pending->mm_geo.lower) + ? 
aligned + : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + if (TROIKA_HAVE_STEADY(troika)) + /* force steady, but only if steady-checkpoint is present */ + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (unlikely(head.txnid == pending->unsafe_txnid)) { + const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); + NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, + pending->unsafe_txnid, txnid); + ENSURE(env, !env->me_txn0 || + (env->me_txn0->mt_owner != osal_thread_self() && + !env->me_txn)); + if (unlikely(txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + ERROR("txnid overflow, raise %d", rc); + goto fail; + } + meta_set_txnid(env, pending, txnid); + eASSERT(env, coherency_check_meta(env, pending, true)); } - meta_set_txnid(env, pending, txnid); - eASSERT(env, coherency_check_meta(env, pending, true)); } } } diff --git a/src/internals.h b/src/internals.h index 25a788b9..4484b180 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1398,6 +1398,7 @@ struct MDBX_env { uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; From f2a49b687a3e69cace547f2f80c36989074e87f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 4 Jan 2023 00:19:48 +0300 Subject: [PATCH 316/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 2c31d3f9..fc423fe7 100644 --- a/ChangeLog.md +++ 
b/ChangeLog.md @@ -5,12 +5,12 @@ English version [by Google](https://gitflic-ru.translate.goog/project/erthink/li and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.3 (Акула) запланирован на 2022-12-20 +## v0.12.3 (Акула) запланирован на 2023-01-07 Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". ``` -18 files changed, 2792 insertions(+), 1698 deletions(-) +20 files changed, 4388 insertions(+), 2907 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` @@ -61,7 +61,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) Второй способ выгоднее если требуется записать много страниц и/или канал взаимодействия имеет весомую задержку (датацентры, облака). Добавленная опция `MDBX_opt_writethrough_threshold` позволяет во время выполнения - задать порог для динамического выбора способа записи в зависимост от + задать порог для динамического выбора способа записи в зависимости от объема и конкретных условия использования. - Автоматическая установка `MDBX_opt_rp_augment_limit` в зависимости от размера БД. @@ -86,6 +86,18 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) ОС), но позволяют застраховать пользователей от массы неверных действий приводящих к повреждению БД. + - Поддержка не-печатных имен для subDb. + + - Добавлен явный выбор `tls_model("local-dynamic")` для обзода проблемы + `relocation R_X86_64_TPOFF32 against FOO cannot be used with -shared` + из-за ошибки в CLANG приводящей к использованию неверного режима `ls_model`. + + - Изменение тактики слияние страниц при удалении. + Теперь слияние выполняется преимущественно с уже измененной/грязной страницей. + Если же справа и слева обе страницы с одинаковым статусом, + то с наименее заполненной, как прежде. В сценариях с массивным удалением + это позволяет увеличить производительность до 50%. 
+ Исправления (без корректировок новых функций): - Изменение размера отображения если это требуется для сброса данных на @@ -122,6 +134,9 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) - Оптимизация `pnl_merge()` для случаев неперекрывающихся объединяемых списков. - Оптимизация поддержки отсортированного списка страниц в `dpl_append()`. - Ускорение работы `mdbx_chk` при обработке пользовательских записей в `@MAIN`. + - Переработка LRU-отметок для спиллинга. + - Переработка контроля "некогерентности" Unified page cache для уменьшения накладных расходов. + - Рефакторинг и микрооптимизация. ------------------------------------------------------------------------------- From 08fb7d5838e4a71e2a5ee2e381e04f2b59a86c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 5 Jan 2023 01:34:52 +0300 Subject: [PATCH 317/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=80=D1=80=D0=B5?= =?UTF-8?q?=D0=BA=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=BA=D0=B0=20=D0=BE=D1=82?= =?UTF-8?q?=D0=BA=D0=BB=D1=8E=D1=87=D0=B5=D0=BD=D0=B8=D1=8F=20`MDBX=5FNOSU?= =?UTF-8?q?BDIR`=20=D0=BF=D1=80=D0=B8=20=D0=BE=D1=82=D0=BA=D1=80=D1=8B?= =?UTF-8?q?=D1=82=D0=B8=D0=B8=20`mdbx.dat`=20=D0=B1=D0=B5=D0=B7=20=D0=B4?= =?UTF-8?q?=D0=B8=D1=80=D0=B5=D0=BA=D1=82=D0=BE=D1=80=D0=B8=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 71 +++++++++++++++++++++++++++--------------------------- src/osal.c | 17 +++++++++++++ src/osal.h | 10 ++++++++ 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/src/core.c b/src/core.c index 9caf8a91..b04cbc23 100644 --- a/src/core.c +++ b/src/core.c @@ -14461,22 +14461,6 @@ typedef struct { size_t ent_len; } MDBX_handle_env_pathname; -static bool path_equal(const pathchar_t *l, const pathchar_t *r, size_t len) { -#if defined(_WIN32) || defined(_WIN64) - while (len > 0) { - pathchar_t a = *l++; - pathchar_t b = *r++; - a 
= (a == '\\') ? '/' : a; - b = (b == '\\') ? '/' : b; - if (a != b) - return false; - } - return true; -#else - return memcmp(l, r, len * sizeof(pathchar_t)) == 0; -#endif -} - __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, const pathchar_t *pathname, MDBX_env_flags_t *flags, @@ -14515,7 +14499,7 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, if (rc != MDBX_ENOFILE) return rc; if (mode == 0 || (*flags & MDBX_RDONLY) != 0) - /* can't open existing */ + /* can't open non-existing */ return rc /* MDBX_ENOFILE */; /* auto-create directory if requested */ @@ -14549,36 +14533,51 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, assert(dxb_name[0] == '/' && lck_name[0] == '/'); const size_t pathname_len = strlen(pathname); #endif - assert(lock_suffix[0] != '\\' && lock_suffix[0] != '/'); + assert(!osal_isdirsep(lock_suffix[0])); ctx->ent_len = pathname_len; static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; - if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len > dxb_name_len && - path_equal(pathname + ctx->ent_len - dxb_name_len, dxb_name, - dxb_name_len)) { - *flags -= MDBX_NOSUBDIR; - ctx->ent_len -= dxb_name_len; + if (*flags & MDBX_NOSUBDIR) { + if (ctx->ent_len > dxb_name_len && + osal_pathequal(pathname + ctx->ent_len - dxb_name_len, dxb_name, + dxb_name_len)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len; + } else if (ctx->ent_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && + osal_isdirsep(lck_name[0]) && + osal_pathequal(pathname + ctx->ent_len - dxb_name_len + 1, + dxb_name + 1, dxb_name_len - 1)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len - 1; + } } - const size_t bytes_needed = - sizeof(pathchar_t) * ctx->ent_len * 2 + - ((*flags & MDBX_NOSUBDIR) ? 
sizeof(lock_suffix) + sizeof(pathchar_t) - : sizeof(lck_name) + sizeof(dxb_name)); + const size_t suflen_with_NOSUBDIR = sizeof(lock_suffix) + sizeof(pathchar_t); + const size_t suflen_without_NOSUBDIR = sizeof(lck_name) + sizeof(dxb_name); + const size_t enogh4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) + ? suflen_with_NOSUBDIR + : suflen_without_NOSUBDIR; + const size_t bytes_needed = sizeof(pathchar_t) * ctx->ent_len * 2 + enogh4any; ctx->buffer_for_free = osal_malloc(bytes_needed); if (!ctx->buffer_for_free) return MDBX_ENOMEM; ctx->dxb = ctx->buffer_for_free; - ctx->lck = ctx->dxb + ctx->ent_len + 1; - memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); - if (*flags & MDBX_NOSUBDIR) { - memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); + if (ctx->ent_len) { + ctx->lck = ctx->dxb + ctx->ent_len + 1; + memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); + if (*flags & MDBX_NOSUBDIR) { + memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); + } else { + ctx->lck += dxb_name_len; + memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); + memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); + } + memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); } else { - ctx->lck += dxb_name_len; - memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); - memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); + ctx->lck = ctx->dxb + dxb_name_len; + memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); + memcpy(ctx->dxb, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); } - memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); - return MDBX_SUCCESS; } diff --git a/src/osal.c b/src/osal.c index 997b9adf..e2abfa0a 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1185,6 +1185,23 @@ MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { #endif } +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + 
size_t len) { +#if defined(_WIN32) || defined(_WIN64) + for (size_t i = 0; i < len; ++i) { + pathchar_t a = l[i]; + pathchar_t b = r[i]; + a = (a == '\\') ? '/' : a; + b = (b == '\\') ? '/' : b; + if (a != b) + return false; + } + return true; +#else + return memcmp(l, r, len * sizeof(pathchar_t)) == 0; +#endif +} + MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, diff --git a/src/osal.h b/src/osal.h index d7809ae4..b362d4b6 100644 --- a/src/osal.h +++ b/src/osal.h @@ -549,6 +549,16 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, From 61e77e7b70e347abbfc230784169c4181a932b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 5 Jan 2023 22:33:56 +0300 Subject: [PATCH 318/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D0=BD=D1=82=D1=80?= =?UTF-8?q?=D0=BE=D0=BB=D1=8C=20=D0=BE=D1=82=D1=81=D1=83=D1=82=D1=81=D1=82?= =?UTF-8?q?=D0=B2=D0=B8=D1=8F=20=D0=B4=D1=83=D0=B1=D0=BB=D0=B8=D0=BA=D0=B0?= =?UTF-8?q?=D1=82=D0=BE=D0=B2=20LCK-=D1=84=D0=B0=D0=B9=D0=BB=D0=B0=20?= =?UTF-8?q?=D1=81=20=D0=B0=D0=BB=D1=8C=D1=82=D0=B5=D1=80=D0=BD=D0=B0=D1=82?= =?UTF-8?q?=D0=B8=D0=B2=D0=BD=D1=8B=D0=BC=D0=B8=20=D0=B8=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B0=D0=BC=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 3 +++ src/core.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- src/osal.c | 27 +++++++++++++++++++++++++++ src/osal.h | 5 +++++ 4 files changed, 78 insertions(+), 7 
deletions(-) diff --git a/mdbx.h b/mdbx.h index 8d8621de..77251ab9 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1932,6 +1932,9 @@ enum MDBX_error_t { * равнозначна \ref MDBX_PROBLEM. */ MDBX_BACKLOG_DEPLETED = -30414, + /** Alternative/Duplicate LCK-file is exists and should be removed manually */ + MDBX_DUPLICATED_CLK = -30413, + /* The last of MDBX-added error codes */ MDBX_LAST_ADDED_ERRCODE = MDBX_TXN_OVERLAPPING, diff --git a/src/core.c b/src/core.c index b04cbc23..1a3ab15d 100644 --- a/src/core.c +++ b/src/core.c @@ -3471,6 +3471,9 @@ __cold const char *mdbx_liberr2str(int errnum) { case MDBX_TXN_OVERLAPPING: return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" " the current thread"; + case MDBX_DUPLICATED_CLK: + return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists, " + "please keep one and remove unused other"; default: return NULL; } @@ -14461,6 +14464,17 @@ typedef struct { size_t ent_len; } MDBX_handle_env_pathname; +__cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { + int err = osal_fileexists(lck_pathname); + if (unlikely(err != MDBX_RESULT_FALSE)) { + if (err == MDBX_RESULT_TRUE) + err = MDBX_DUPLICATED_CLK; + ERROR("Alternative/Duplicate LCK-file '%" MDBX_PRIsPATH "' error %d", + lck_pathname, err); + } + return err; +} + __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, const pathchar_t *pathname, MDBX_env_flags_t *flags, @@ -14562,23 +14576,45 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, return MDBX_ENOMEM; ctx->dxb = ctx->buffer_for_free; + ctx->lck = ctx->dxb + ctx->ent_len + dxb_name_len + 1; + pathchar_t *const buf = ctx->buffer_for_free; + rc = MDBX_SUCCESS; if (ctx->ent_len) { - ctx->lck = ctx->dxb + ctx->ent_len + 1; + memcpy(buf, pathname, sizeof(pathchar_t) * pathname_len); + if (*flags & MDBX_NOSUBDIR) { + const pathchar_t *const lck_ext = + osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); + if (lck_ext) { + pathchar_t *pathname_ext = 
osal_fileext(buf, pathname_len); + memcpy(pathname_ext ? pathname_ext : buf + pathname_len, lck_ext, + sizeof(pathchar_t) * (ARRAY_END(lck_name) - lck_ext)); + rc = check_alternative_lck_absent(buf); + } + } else { + memcpy(buf + ctx->ent_len, dxb_name, sizeof(dxb_name)); + memcpy(buf + ctx->ent_len + dxb_name_len, lock_suffix, + sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + } + memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); + memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); if (*flags & MDBX_NOSUBDIR) { memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); } else { - ctx->lck += dxb_name_len; - memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); + memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); } - memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); } else { - ctx->lck = ctx->dxb + dxb_name_len; - memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); + assert(!(*flags & MDBX_NOSUBDIR)); + memcpy(buf, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + memcpy(ctx->dxb, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); } - return MDBX_SUCCESS; + return rc; } __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { diff --git a/src/osal.c b/src/osal.c index e2abfa0a..5e7606c1 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1185,6 +1185,33 @@ MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { #endif } +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + if (GetFileAttributesW(pathname) != INVALID_FILE_ATTRIBUTES) + return MDBX_RESULT_TRUE; + int err = GetLastError(); + return (err == ERROR_FILE_NOT_FOUND 
|| err == ERROR_PATH_NOT_FOUND) + ? MDBX_RESULT_FALSE + : err; +#else + if (access(pathname, F_OK) == 0) + return MDBX_RESULT_TRUE; + int err = errno; + return (err == ENOENT || err == ENOTDIR) ? MDBX_RESULT_FALSE : err; +#endif +} + +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len) { + const pathchar_t *ext = nullptr; + for (size_t i = 0; i < len && pathname[i]; i++) + if (pathname[i] == '.') + ext = pathname + i; + else if (osal_isdirsep(pathname[i])) + ext = nullptr; + return (pathchar_t *)ext; +} + MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, size_t len) { #if defined(_WIN32) || defined(_WIN64) diff --git a/src/osal.h b/src/osal.h index b362d4b6..90173bc4 100644 --- a/src/osal.h +++ b/src/osal.h @@ -225,8 +225,10 @@ osal_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" #else typedef char pathchar_t; +#define MDBX_PRIsPATH "s" #endif typedef struct osal_mmap { @@ -559,6 +561,9 @@ MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, const MDBX_env *env, const pathchar_t *pathname, From b86b71a9481e6f1f39b99192b055f79a94b9b4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 6 Jan 2023 12:03:01 +0300 Subject: [PATCH 319/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 6 ++++-- 1 file changed, 4 insertions(+), 
2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index fc423fe7..48013d08 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -10,7 +10,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". ``` -20 files changed, 4388 insertions(+), 2907 deletions(-) +20 files changed, 4504 insertions(+), 2924 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` @@ -92,12 +92,14 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) `relocation R_X86_64_TPOFF32 against FOO cannot be used with -shared` из-за ошибки в CLANG приводящей к использованию неверного режима `ls_model`. - - Изменение тактики слияние страниц при удалении. + - Изменение тактики слияния страниц при удалении. Теперь слияние выполняется преимущественно с уже измененной/грязной страницей. Если же справа и слева обе страницы с одинаковым статусом, то с наименее заполненной, как прежде. В сценариях с массивным удалением это позволяет увеличить производительность до 50%. + - Добавлен контроль отсутствия LCK-файлов с альтернативным именованием. 
+ Исправления (без корректировок новых функций): - Изменение размера отображения если это требуется для сброса данных на From 68a8a15621cfae2a7d45eb115d263791e0f367ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 6 Jan 2023 23:31:07 +0300 Subject: [PATCH 320/364] =?UTF-8?q?mdbx:=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B0=D0=B4=D1=80=D0=B5=D1=81=D0=B0?= =?UTF-8?q?=20ioarena.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GNUmakefile | 4 ++-- README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index e46a36bf..eee59356 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -788,7 +788,7 @@ IOARENA := $(shell \ (test -x ../ioarena/@BUILD/src/ioarena && echo ../ioarena/@BUILD/src/ioarena) || \ (test -x ../../@BUILD/src/ioarena && echo ../../@BUILD/src/ioarena) || \ (test -x ../../src/ioarena && echo ../../src/ioarena) || which ioarena 2>&- || \ - (echo false && echo '$(TIP) Clone and build the https://github.com/pmwkaa/ioarena.git within a neighbouring directory for availability of benchmarking.' >&2)) + (echo false && echo '$(TIP) Clone and build the https://abf.io/erthink/ioarena.git within a neighbouring directory for availability of benchmarking.' >&2)) endif NN ?= 25000000 BENCH_CRUD_MODE ?= nosync @@ -802,7 +802,7 @@ re-bench: bench-clean bench ifeq ($(or $(IOARENA),false),false) bench bench-quartet bench-triplet bench-couple: $(QUIET)echo 'The `ioarena` benchmark is required.' >&2 && \ - echo 'Please clone and build the https://github.com/pmwkaa/ioarena.git within a neighbouring `ioarena` directory.' >&2 && \ + echo 'Please clone and build the https://abf.io/erthink/ioarena.git within a neighbouring `ioarena` directory.' 
>&2 && \ false else diff --git a/README.md b/README.md index 44c68726..818bf0d7 100644 --- a/README.md +++ b/README.md @@ -248,7 +248,7 @@ the user's point of view. > and up to 30% faster when _libmdbx_ compiled with specific build options > which downgrades several runtime checks to be match with LMDB behaviour. > - > These and other results could be easily reproduced with [ioArena](https://github.com/pmwkaa/ioarena) just by `make bench-quartet` command, + > These and other results could be easily reproduced with [ioArena](https://abf.io/erthink/ioarena.git) just by `make bench-quartet` command, > including comparisons with [RockDB](https://en.wikipedia.org/wiki/RocksDB) > and [WiredTiger](https://en.wikipedia.org/wiki/WiredTiger). @@ -630,7 +630,7 @@ Bindings Performance comparison ====================== -All benchmarks were done in 2015 by [IOArena](https://github.com/pmwkaa/ioarena) +All benchmarks were done in 2015 by [IOArena](https://abf.io/erthink/ioarena.git) and multiple [scripts](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015) runs on Lenovo Carbon-2 laptop, i7-4600U 2.1 GHz (2 physical cores, 4 HyperThreading cores), 8 Gb RAM, SSD SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Gb. From f1fdb88938c07037e5bef12c884eca47b7437fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 7 Jan 2023 00:00:05 +0300 Subject: [PATCH 321/364] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=BF=D1=83=D1=81?= =?UTF-8?q?=D0=BA=20v0.12.3=20"=D0=90=D0=BA=D1=83=D0=BB=D0=B0"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". Благодарности: -------------- - [Alex Sharov](https://t.me/AskAlexSharov) и команде [Erigon](https://github.com/ledgerwatch/erigon) за тестирование. 
- [Simon Leier](https://t.me/leisim) за сообщение о сбоях и тестирование. Новое: ------ - Использование адреса [https://libmdbx.dqdkfa.ru/dead-github](https://libmdbx.dqdkfa.ru/dead-github) для отсылки к сохранённым в web.archive.org копиям ресурсов, уничтоженных администрацией Github. - Реализована prefault-запись при выделении страниц для read-write отображений. Это приводит к кратному снижению системных издержек и существенному увеличению производительности в соответствующих сценариях использования, когда: - размер БД и объём данных существенно больше ОЗУ; - используется режим `MDBX_WRITEMAP`; - не-мелкие транзакции (по ходу транзакции выделяется многие сотни или тысячи страниц). В режиме `MDBX_WRITEMAP` выделение/переиспользование страниц приводит к page-fault и чтению страницы с диска, даже если содержимое страницы не нужно (будет перезаписано). Это является следствием работы подсистемы виртуальной памяти, а штатный способ лечения через `MADV_REMOVE` работает не на всех ФС и обычно дороже получаемой экономии. Теперь в libmdbx используется "упреждающая запись" таких страниц, которая на системах с [unified page cache](https://www.opennet.ru/base/dev/ubc.txt.html) приводит к "вталкиванию" данных, устраняя необходимость чтения с диска при обращении к такой странице памяти. Новый функционал работает в согласованности с автоматическим управлением read-ahead и кэшем статуса присутствия страниц в ОЗУ, посредством [mincore()](https://man7.org/linux/man-pages/man2/mincore.2.html). - Добавлена опция `MDBX_opt_prefault_write_enable` для возможности принудительного включения/выключения prefault-записи. - Реализован динамический выбор между сквозной записью на диск и обычной записью с последующим [fdatasync()](https://man7.org/linux/man-pages/man3/fdatasync.3p.html) управляемый опцией `MDBX_opt_writethrough_threshold`. 
В долговечных (durable) режимах данные на диск могут быть сброшены двумя способами: - сквозной записью через файловый дескриптор открытый с `O_DSYNC`; - обычной записью с последующим вызовом `fdatasync()`. Первый способ выгоднее при записи малого количества страниц и/или если канал взаимодействия с диском/носителем имеет близкую к нулю задержку. Второй способ выгоднее если требуется записать много страниц и/или канал взаимодействия имеет весомую задержку (датацентры, облака). Добавленная опция `MDBX_opt_writethrough_threshold` позволяет во время выполнения задать порог для динамического выбора способа записи в зависимости от объема и конкретных условий использования. - Автоматическая установка `MDBX_opt_rp_augment_limit` в зависимости от размера БД. - Запрещение разного режима `MDBX_WRITEMAP` между процессами в режимах с отложенной/ленивой записью, так как в этом случае невозможно обеспечить сброс данных на диск во всех случаях на всех поддерживаемых платформах. - Добавлена опция сборки `MDBX_MMAP_USE_MS_ASYNC` позволяющая отключить использование системного вызова `msync(MS_ASYNC)`, в использовании которого нет необходимости на подавляющем большинстве актуальных ОС. По умолчанию `MDBX_MMAP_USE_MS_ASYNC=0` (выключено) на Linux и других системах с unified page cache. Такое поведение (без использования `msync(MS_ASYNC)`) соответствует неизменяемой (hardcoded) логике LMDB. В результате, в простых/наивных бенчмарках, libmdbx опережает LMDB примерно также как при реальном применении. На всякий случай стоит еще раз отметить/напомнить, что на Windows предположительно libmdbx будет отставать от LMDB в сценариях с множеством мелких транзакций, так как libmdbx осознанно использует на Windows файловые блокировки, которые медленные (плохо реализованы в ядре ОС), но позволяют застраховать пользователей от массы неверных действий приводящих к повреждению БД. - Поддержка не-печатных имен для subDb. 
- Добавлен явный выбор `tls_model("local-dynamic")` для обхода проблемы `relocation R_X86_64_TPOFF32 against FOO cannot be used with -shared` из-за ошибки в CLANG приводящей к использованию неверного режима `tls_model`. - Изменение тактики слияния страниц при удалении. Теперь слияние выполняется преимущественно с уже измененной/грязной страницей. Если же справа и слева обе страницы с одинаковым статусом, то с наименее заполненной, как прежде. В сценариях с массивным удалением это позволяет увеличить производительность до 50%. - Добавлен контроль отсутствия LCK-файлов с альтернативным именованием. Исправления (без корректировок новых функций): ---------------------------------------------- - Изменение размера отображения если это требуется для сброса данных на диск при вызове `mdbx_env_sync()` из параллельного потока выполнения вне работающей транзакции. - Исправление регресса после коммита db72763de049d6e4546f838277fe83b9081ad1de от 2022-10-08 в логике возврата грязных страниц в режиме `MDBX_WRITEMAP`, из-за чего освободившиеся страницы использовались не немедленно, а попадали в retired-список совершаемой транзакции и происходил необоснованный рост размера транзакции. - Устранение SIGSEGV или ошибочного вызова `free()` в ситуациях повторного открытия среды посредством `mdbx_env_open()`. - Устранение ошибки совершенной в коммите fe20de136c22ed3bc4c6d3f673e79c106e824f60 от 2022-09-18, в результате чего на Linux в режиме `MDBX_WRITEMAP` никогда не вызывался `msync()`. Проблема существует только в релизе 0.12.2. - Добавление подсчета грязных страниц в `MDBX_WRITEMAP` для предоставления посредством `mdbx_txn_info()` актуальной информации об объеме изменений в процессе транзакций чтения-записи. - Исправление несущественной опечатки в условиях `#if` определения порядка байт. - Исправление сборки для случая `MDBX_PNL_ASCENDING=1`. 
Ликвидация технических долгов и мелочи: --------------------------------------- - Доработка поддержки авто-слияния записей GC внутри `page_alloc_slowpath()`. - Устранение несущественных предупреждений Coverity. - Использование единого курсора для поиска в GC. - Переработка внутренних флагов связанных с выделением страниц из GC. - Доработка подготовки резерва перед обновлением GC при включенном BigFoot. - Оптимизация `pnl_merge()` для случаев неперекрывающихся объединяемых списков. - Оптимизация поддержки отсортированного списка страниц в `dpl_append()`. - Ускорение работы `mdbx_chk` при обработке пользовательских записей в `@MAIN`. - Переработка LRU-отметок для спиллинга. - Переработка контроля "некогерентности" Unified page cache для уменьшения накладных расходов. - Рефакторинг и микрооптимизация. 20 files changed, 4504 insertions(+), 2924 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) --- ChangeLog.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 48013d08..cb34bddb 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -5,12 +5,12 @@ English version [by Google](https://gitflic-ru.translate.goog/project/erthink/li and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.3 (Акула) запланирован на 2023-01-07 +## v0.12.3 (Акула) от 2023-01-07 Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". 
``` -20 files changed, 4504 insertions(+), 2924 deletions(-) +20 files changed, 4508 insertions(+), 2928 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` From 16cda5c2e891fd7a803b69d4befbd67938831e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 8 Jan 2023 12:40:44 +0300 Subject: [PATCH 322/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BE=D0=BF=D0=B5=D1=87?= =?UTF-8?q?=D0=B0=D1=82=D0=BE=D0=BA=20=D0=B2=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index cb34bddb..1b8ca7ad 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -77,7 +77,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) системах с unified page cache. Такое поведение (без использования `msync(MS_ASYNC)`) соответствует неизменяемой (hardcoded) логике LMDB. В результате, в простых/наивных бенчмарках, libmdbx опережает LMDB - примерна также как при реальном применении. + примерно также как при реальном применении. На всякий случай стоит еще раз отметить/напомнить, что на Windows предположительно libmdbx будет отставать от LMDB в сценариях с @@ -88,7 +88,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) - Поддержка не-печатных имен для subDb. - - Добавлен явный выбор `tls_model("local-dynamic")` для обзода проблемы + - Добавлен явный выбор `tls_model("local-dynamic")` для обхода проблемы `relocation R_X86_64_TPOFF32 against FOO cannot be used with -shared` из-за ошибки в CLANG приводящей к использованию неверного режима `ls_model`. 
From 3da23da7b310fceca85fcad4c8dde5fa14aa525d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 9 Jan 2023 21:32:35 +0300 Subject: [PATCH 323/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=81=D0=BC=D0=B5?= =?UTF-8?q?=D1=82=D0=B8=D1=87=D0=B5=D1=81=D0=BA=D0=B8=D0=B9=20=D1=80=D0=B5?= =?UTF-8?q?=D1=84=D0=B0=D0=BA=D1=82=D0=BE=D1=80=D0=B8=D0=BD=D0=B3=20=D0=BA?= =?UTF-8?q?=D0=BE=D0=BD=D1=82=D1=80=D0=BE=D0=BB=D1=8F=20`MDBX=5FAPPEND`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/core.c b/src/core.c index 1a3ab15d..c17553a0 100644 --- a/src/core.c +++ b/src/core.c @@ -17172,15 +17172,15 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { rc = cursor_last(mc, &dkey, &olddata); if (likely(rc == MDBX_SUCCESS)) { - rc = mc->mc_dbx->md_cmp(key, &dkey); - if (likely(rc > 0)) { + const int cmp = mc->mc_dbx->md_cmp(key, &dkey); + if (likely(cmp > 0)) { mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; + } else if (unlikely(cmp != 0)) { + /* new-key < last-key */ + return MDBX_EKEYMISMATCH; } else { - if (unlikely(rc != MDBX_SUCCESS)) - /* new-key < last-key - * or new-key == last-key without MDBX_APPENDDUP */ - return MDBX_EKEYMISMATCH; + rc = MDBX_SUCCESS; exact = true; } } From 702c67fc38667776824a74f7d140b3410a713c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 9 Jan 2023 21:33:08 +0300 Subject: [PATCH 324/364] =?UTF-8?q?mdbx-test:=20=D0=B4=D0=BE=D1=80=D0=B0?= =?UTF-8?q?=D0=B1=D0=BE=D1=82=D0=BA=D0=B0=20append-=D1=82=D0=B5=D1=81?= =?UTF-8?q?=D1=82=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit - добавлен speculum-контроль; - с вероятностью 1/8 генерируются не-последовательные/не-упорядоченные ключи для проверки возврата MDBX_EKEYMISMATCH; - игнорирование расхождения хеша последовательности для не-последовательных ключей. --- test/append.c++ | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/test/append.c++ b/test/append.c++ index 5ca2245f..59e9ec12 100644 --- a/test/append.c++ +++ b/test/append.c++ @@ -21,7 +21,14 @@ public: bool run() override; static bool review_params(actor_params &params) { - return testcase::review_params(params) && params.make_keygen_linear(); + if (!testcase::review_params(params)) + return false; + const bool ordered = !flipcoin_x3(); + log_notice("the '%s' key-generation mode is selected", + ordered ? "ordered/linear" : "unordered/non-linear"); + if (ordered && !params.make_keygen_linear()) + return false; + return true; } }; REGISTER_TESTCASE(append); @@ -133,8 +140,6 @@ bool testcase_append::run() { } } else failure_perror("mdbx_get_equal_or_great()", err); - - assert(!expect_key_mismatch); } err = mdbx_cursor_put(cursor_guard.get(), &key->value, &data->value, flags); @@ -148,12 +153,25 @@ bool testcase_append::run() { if (!expect_key_mismatch) { if (unlikely(err != MDBX_SUCCESS)) - failure_perror("mdbx_cursor_put(insert-a)", err); + failure_perror("mdbx_cursor_put(append)", err); ++inserted_number; inserted_checksum.push((uint32_t)inserted_number, key->value); inserted_checksum.push(10639, data->value); + + if (config.params.speculum) { + Item item(iov2dataview(key), iov2dataview(data)); + const auto insertion_result = speculum.insert(item); + if (!insertion_result.second) { + char dump_key[32], dump_value[32]; + log_error( + "speculum.append: unexpected %s {%s, %s}", "MDBX_SUCCESS", + mdbx_dump_val(&key->value, dump_key, sizeof(dump_key)), + mdbx_dump_val(&data->value, dump_value, sizeof(dump_value))); + return false; + } + } } 
else if (unlikely(err != MDBX_EKEYMISMATCH)) - failure_perror("mdbx_cursor_put(insert-a) != MDBX_EKEYMISMATCH", err); + failure_perror("mdbx_cursor_put(append) != MDBX_EKEYMISMATCH", err); if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); @@ -166,6 +184,10 @@ bool testcase_append::run() { committed_inserted_number = inserted_number; committed_inserted_checksum = inserted_checksum; txn_nops = 0; + if (!speculum_verify()) { + log_notice("append: bailout breakable_restart"); + return false; + } } report(1); @@ -181,6 +203,10 @@ bool testcase_append::run() { } //---------------------------------------------------------------------------- txn_begin(true); + if (!speculum_verify()) { + log_notice("append: bailout verify"); + return false; + } cursor_renew(); MDBX_val check_key, check_data; @@ -209,7 +235,8 @@ bool testcase_append::run() { failure("read_count(%" PRIu64 ") != inserted_number(%" PRIu64 ")", read_count, inserted_number); - if (unlikely(read_checksum.value != inserted_checksum.value)) + if (unlikely(read_checksum.value != inserted_checksum.value) && + !keyvalue_maker.is_unordered()) failure("read_checksum(0x%016" PRIu64 ") " "!= inserted_checksum(0x%016" PRIu64 ")", read_checksum.value, inserted_checksum.value); From 525c4a55a4e10018967b4838df202434c381f277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 10 Jan 2023 14:16:08 +0300 Subject: [PATCH 325/364] mdbx: fix English typos. 
Thanks to Dimitris Apostolou --- ChangeLog.md | 4 ++-- GNUmakefile | 2 +- mdbx.h | 20 ++++++++++---------- mdbx.h++ | 2 +- src/core.c | 16 ++++++++-------- src/internals.h | 4 ++-- src/mdbx_chk.c | 2 +- src/options.h | 2 +- src/osal.h | 2 +- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 1b8ca7ad..53438b7d 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1172,7 +1172,7 @@ Fixes: - Fix a lot of typos & spelling (Thanks to Josh Soref for PR). - Fix `getopt()` messages for Windows (Thanks to Andrey Sporaw for reporting). - Fix MSVC compiler version requirements (Thanks to Andrey Sporaw for reporting). - - Workarounds for QEMU's bugs to run tests for cross-builded library under QEMU. + - Workarounds for QEMU's bugs to run tests for cross-built[A library under QEMU. - Now C++ compiler optional for building by CMake. @@ -1241,7 +1241,7 @@ Deprecated functions and flags: - Avoid using `pwritev()` for single-writes (up to 10% speedup for some kernels & scenarios). - Avoiding `MDBX_UTTERLY_NOSYNC` as result of flags merge. - Add `mdbx_dbi_dupsort_depthmask()` function. - - Add `MDBX_CP_FORCE_RESIZEABLE` option. + - Add `MDBX_CP_FORCE_RESIZABLE` option. - Add deprecated `MDBX_MAP_RESIZED` for compatibility. - Add `MDBX_BUILD_TOOLS` option (default `ON`). - Refine `mdbx_dbi_open_ex()` to safe concurrently opening the same handle from different threads. diff --git a/GNUmakefile b/GNUmakefile index eee59356..89b8e506 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -6,7 +6,7 @@ # ################################################################################ # -# Basic internal definitios. For a customizable variables and options see below. +# Basic internal definitions. For a customizable variables and options see below. 
# $(info // The GNU Make $(MAKE_VERSION)) SHELL := $(shell env bash -c 'echo $$BASH') diff --git a/mdbx.h b/mdbx.h index 77251ab9..c4e0b756 100644 --- a/mdbx.h +++ b/mdbx.h @@ -695,11 +695,11 @@ extern LIBMDBX_VERINFO_API const struct MDBX_build_info { * automatically (de)initialization, releasing reader lock table slots * and so on. * - * If MDBX builded as a DLL this is done out-of-the-box by DllEntry() function, + * If MDBX built as a DLL this is done out-of-the-box by DllEntry() function, * which called automatically by Windows core with passing corresponding reason * argument. * - * Otherwise, if MDBX was builded not as a DLL, some black magic + * Otherwise, if MDBX was built not as a DLL, some black magic * may be required depending of Windows version: * * - Modern Windows versions, including Windows Vista and later, provides @@ -881,7 +881,7 @@ enum MDBX_constants { /* DEBUG & LOGGING ************************************************************/ /** \addtogroup c_debug - * \note Most of debug feature enabled only when libmdbx builded with + * \note Most of debug feature enabled only when libmdbx built with * \ref MDBX_DEBUG build option. @{ */ /** Log level @@ -946,7 +946,7 @@ typedef enum MDBX_log_level_t MDBX_log_level_t; * * \details `MDBX_DBG_DUMP` and `MDBX_DBG_LEGACY_MULTIOPEN` always have an * effect, but `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` and `MDBX_DBG_JITTER` only if - * libmdbx builded with \ref MDBX_DEBUG. */ + * libmdbx built with \ref MDBX_DEBUG. */ enum MDBX_debug_flags_t { MDBX_DBG_NONE = 0, @@ -1682,7 +1682,7 @@ enum MDBX_copy_flags_t { * pages sequentially */ MDBX_CP_COMPACT = 1u, - /** Force to make resizeable copy, i.e. dynamic size instead of fixed */ + /** Force to make resizable copy, i.e. dynamic size instead of fixed */ MDBX_CP_FORCE_DYNAMIC_SIZE = 2u }; #ifndef __cplusplus @@ -2449,7 +2449,7 @@ LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, * account skipping free pages. 
* * - \ref MDBX_CP_FORCE_DYNAMIC_SIZE - * Force to make resizeable copy, i.e. dynamic size instead of fixed. + * Force to make resizable copy, i.e. dynamic size instead of fixed. * * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest, @@ -2907,7 +2907,7 @@ enum MDBX_warmup_flags_t { MDBX_warmup_lock = 4, /** Alters corresponding current resource limits to be enough for lock pages - * by \ref MDBX_warmup_lock. However, this option should be used in simpliest + * by \ref MDBX_warmup_lock. However, this option should be used in simpler * applications since takes into account only current size of this environment * disregarding all other factors. For real-world database application you * will need full-fledged management of resources and their limits with @@ -2943,7 +2943,7 @@ DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t) * \param [in] timeout_seconds_16dot16 Optional timeout which checking only * during explicitly peeking database pages * for loading ones if the \ref MDBX_warmup_force - * option was spefified. + * option was specified. * * \returns A non-zero error value on failure and 0 on success. * Some possible errors are: @@ -3071,7 +3071,7 @@ LIBMDBX_API int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *fd); * was called after \ref mdbx_env_open() but OUTSIDE a write transaction, * then MDBX will execute internal pseudo-transaction to apply new parameters * (but only if anything has been changed), and changes be visible to any - * others processes immediately after succesful completion of function. + * others processes immediately after successful completion of function. * * Essentially a concept of "automatic size management" is simple and useful: * - There are the lower and upper bounds of the database file size; @@ -5265,7 +5265,7 @@ mdbx_get_datacmp(MDBX_db_flags_t flags); * \param [in] thread The reader thread ID. 
* \param [in] bytes_used The number of last used page * in the MVCC-snapshot which being read, - * i.e. database file can't shrinked beyond this. + * i.e. database file can't be shrunk beyond this. * \param [in] bytes_retained The total size of the database pages that were * retired by committed write transactions after * the reader's MVCC-snapshot, diff --git a/mdbx.h++ b/mdbx.h++ index b57e7507..4c591641 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -3450,7 +3450,7 @@ public: /// transactions since the current read /// transaction started. size_t bytes_used; ///< The number of last used page in the MVCC-snapshot - ///< which being read, i.e. database file can't shrinked + ///< which being read, i.e. database file can't be shrunk ///< beyond this. size_t bytes_retained; ///< The total size of the database pages that ///< were retired by committed write transactions diff --git a/src/core.c b/src/core.c index c17553a0..d7ee6493 100644 --- a/src/core.c +++ b/src/core.c @@ -395,7 +395,7 @@ node_largedata_pgno(const MDBX_node *const __restrict node) { * * BRANCH_NODE_MAX * Branch-page must contain at least two nodes, within each a key and a child - * page number. But page can't be splitted if it contains less that 4 keys, + * page number. But page can't be split if it contains less that 4 keys, * i.e. a page should not overflow before adding the fourth key. Therefore, * at least 3 branch-node should fit in the single branch-page. Further, the * first node of a branch-page doesn't contain a key, i.e. the first node @@ -409,8 +409,8 @@ node_largedata_pgno(const MDBX_node *const __restrict node) { * Leaf-node must fit into single leaf-page, where a value could be placed on * a large/overflow page. However, may require to insert a nearly page-sized * node between two large nodes are already fill-up a page. 
In this case the - * page must be splitted to two if some pair of nodes fits on one page, or - * otherwise the page should be splitted to the THREE with a single node + * page must be split to two if some pair of nodes fits on one page, or + * otherwise the page should be split to the THREE with a single node * per each of ones. Such 1-into-3 page splitting is costly and complex since * requires TWO insertion into the parent page, that could lead to split it * and so on up to the root. Therefore double-splitting is avoided here and @@ -1032,7 +1032,7 @@ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, * the asynchronously cancellation of read transaction. Therefore, * there may be a collision between the cleanup performed here and * asynchronous termination and restarting of the read transaction - * in another proces/thread. In general we MUST NOT reset the `mr_txnid` + * in another process/thread. In general we MUST NOT reset the `mr_txnid` * if a new transaction was started (i.e. if `mr_txnid` was changed). 
*/ #if MDBX_64BIT_CAS bool rc = atomic_cas64(p, compare, UINT64_MAX); @@ -15499,12 +15499,12 @@ __hot static __always_inline int page_get_checker_lite(const uint16_t ILL, assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0); assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); return bad_page(page, "unexpected %s instead of %s (%u)\n", - "large/overlow", "branch/leaf/leaf2", page->mp_flags); + "large/overflow", "branch/leaf/leaf2", page->mp_flags); } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) { assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2)); assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); return bad_page(page, "unexpected %s instead of %s (%u)\n", - "branch/leaf/leaf2", "large/overlow", page->mp_flags); + "branch/leaf/leaf2", "large/overflow", page->mp_flags); } else { assert(false); } @@ -21353,7 +21353,7 @@ __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { unaligned_poke_u64(4, meta->mm_sign, meta_sign(meta)); } -/* Make resizeable */ +/* Make resizable */ __cold static void meta_make_sizeable(MDBX_meta *meta) { meta->mm_geo.lower = MIN_PAGENO; if (meta->mm_geo.grow_pv == 0) { @@ -21600,7 +21600,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, break; rc = errno; if (rc == EXDEV || rc == /* workaround for ecryptfs bug(s), - maybe usefull for others fs */ + maybe useful for others FS */ EINVAL) not_the_same_filesystem = true; else if (ignore_enosys(rc) == MDBX_RESULT_TRUE) diff --git a/src/internals.h b/src/internals.h index 4484b180..d15c21aa 100644 --- a/src/internals.h +++ b/src/internals.h @@ -35,7 +35,7 @@ /** Disables using GNU/Linux libc extensions. 
* \ingroup build_option - * \note This option couldn't be moved to the options.h since dependant + * \note This option couldn't be moved to the options.h since dependent * control macros/defined should be prepared before include the options.h */ #ifndef MDBX_DISABLE_GNU_SOURCE #define MDBX_DISABLE_GNU_SOURCE 0 @@ -920,7 +920,7 @@ typedef struct MDBX_lockinfo { /* Paired counter of processes that have mlock()ed part of mmapped DB. * The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process - * lock at leat one page, so therefore madvise() could return EINVAL. */ + * lock at least one page, so therefore madvise() could return EINVAL. */ MDBX_atomic_uint32_t mti_mlcnt[2]; MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 3a6c59eb..19d50f8c 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -1493,7 +1493,7 @@ int main(int argc, char *argv[]) { alloc_pages = backed_pages; } } else { - /* LY: DB may be shrinked by writer down to the allocated pages. */ + /* LY: DB may be shrunk by writer down to the allocated pages. */ if (alloc_pages > backed_pages) { print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", alloc_pages, backed_pages); diff --git a/src/options.h b/src/options.h index a4081e6c..596efdc0 100644 --- a/src/options.h +++ b/src/options.h @@ -166,7 +166,7 @@ /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. - * \warning The database format depend on this option and libmdbx builded with + * \warning The database format depend on this option and libmdbx built with * different option value are incompatible. 
*/ #ifndef MDBX_PNL_ASCENDING #define MDBX_PNL_ASCENDING 0 diff --git a/src/osal.h b/src/osal.h index 90173bc4..6eb519aa 100644 --- a/src/osal.h +++ b/src/osal.h @@ -704,7 +704,7 @@ MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to -/// operational level specified by argument. The reson for such downgrade: +/// operational level specified by argument. The reason for such downgrade: /// - unblocking of other processes that are waiting for access, i.e. /// if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes /// should be made aware that access is unavailable rather than From 56050f201fba67ecca7a0211b5131d16eeb93b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 10 Jan 2023 15:03:38 +0300 Subject: [PATCH 326/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index 53438b7d..dd1921a6 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,11 +4,24 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). +## v0.13 (в разработке) + +Ликвидация технических долгов и мелочи: + + - Исправление опечаток. + - Доработка теста для полной стохастической проверки `MDBX_EKEYMISMATCH` в режиме `MDBX_APPEND`. 
+ + +------------------------------------------------------------------------------- + ## v0.12.3 (Акула) от 2023-01-07 Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". +Добавлена prefault-запись, переделан контроль “некогерентности” unified page/buffer cache, изменена тактика слияния страниц и т.д. +Стало ещё быстрее, в некоторых сценариях вдвое. + ``` 20 files changed, 4508 insertions(+), 2928 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) From 0159f97e940e29dcc61f0f036f86a0a6d13db8e3 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Wed, 11 Jan 2023 01:08:30 +0300 Subject: [PATCH 327/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B3=D1=80=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=87=D0=B8=D0=B2=D0=B0=D0=B5=D0=BC=20=D1=80=D0=B0=D0=B7?= =?UTF-8?q?=D0=BC=D0=B5=D1=80=20=D0=BE=D1=82=D0=BE=D0=B1=D1=80=D0=B0=D0=B6?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=BF=D1=80=D0=B8=20=D0=BA=D0=BE?= =?UTF-8?q?=D1=80=D0=BE=D1=82=D0=BA=D0=BE=D0=BC=20read-only=20=D1=84=D0=B0?= =?UTF-8?q?=D0=B9=D0=BB=D0=B5.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Цель в предотвращении ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая совсем не информативна для пользователя и возникает в этом случае (когда файл открыт read-only и короче запрошенного размера). --- ChangeLog.md | 6 ++++++ src/osal.c | 13 +++++++++---- src/osal.h | 5 ++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index dd1921a6..1dbd33b9 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -6,6 +6,12 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic ## v0.13 (в разработке) +Исправления (без корректировок новых функций): + + - Ограничиваем размер отображения при коротком read-only файле для + предотвращении ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая совсем + не информативна для пользователя и возникает в этом случае. 
+ Ликвидация технических долгов и мелочи: - Исправление опечаток. diff --git a/src/osal.c b/src/osal.c index 5e7606c1..dfe528e1 100644 --- a/src/osal.c +++ b/src/osal.c @@ -2160,9 +2160,8 @@ static int check_mmap_limit(const size_t limit) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t size, const size_t limit, - const unsigned options) { +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options) { assert(size <= limit); map->limit = 0; map->current = 0; @@ -2192,7 +2191,13 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, err = osal_filesize(map->fd, &map->filesize); if (err != MDBX_SUCCESS) return err; -#if !(defined(_WIN32) || defined(_WIN64)) +#if defined(_WIN32) || defined(_WIN64) + if (map->filesize < size) { + WARNING("file size (%zu) less than requested for mapping (%zu)", + (size_t)map->filesize, size); + size = (size_t)map->filesize; + } +#else map->current = (map->filesize > limit) ? 
limit : (size_t)map->filesize; #endif /* !Windows */ } diff --git a/src/osal.h b/src/osal.h index 6eb519aa..77277279 100644 --- a/src/osal.h +++ b/src/osal.h @@ -577,9 +577,8 @@ MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 From 9e15bd9b290dc99f63634e95c163c34b4af1cc08 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Wed, 11 Jan 2023 21:09:50 +0300 Subject: [PATCH 328/364] =?UTF-8?q?mdbx-windows:=20=D1=83=D1=81=D1=82?= =?UTF-8?q?=D1=80=D0=B0=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=80=D0=B5=D0=B3?= =?UTF-8?q?=D1=80=D0=B5=D1=81=D1=81=D0=B0=20ERROR=5FSHARING=5FVIOLATION=20?= =?UTF-8?q?=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=B5=20MDBX=5FEXCLUSIV?= =?UTF-8?q?E.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Спасибо maxc0d3r@protonmail.com за сообщение о проблеме. --- ChangeLog.md | 9 +++++++++ src/core.c | 9 +++++++-- src/osal.c | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 1dbd33b9..d6092b1d 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -6,8 +6,17 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic ## v0.13 (в разработке) +Благодарности: + + - Max за сообщение о проблеме ERROR_SHARING_VIOLATION + в режиме MDBX_EXCLUSIVE на Windows. 
+ Исправления (без корректировок новых функций): + - Устранение регресса после коммита 474391c83c5f81def6fdf3b0b6f5716a87b78fbf + приводящего к возврату ERROR_SHARING_VIOLATION в Windows при открытии БД + в режиме MDBX_EXCLUSIVE для чтения-записи. + - Ограничиваем размер отображения при коротком read-only файле для предотвращении ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая совсем не информативна для пользователя и возникает в этом случае. diff --git a/src/core.c b/src/core.c index d7ee6493..4ffe694c 100644 --- a/src/core.c +++ b/src/core.c @@ -14892,7 +14892,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, #if defined(_WIN32) || defined(_WIN64) eASSERT(env, env->me_overlapped_fd == 0); bool ior_direct = false; - if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC))) { + if (!(flags & + (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { /* Запрошен режим MDBX_SAFE_NOSYNC | MDBX_WRITEMAP при активной опции * MDBX_AVOID_MSYNC. 
@@ -14964,7 +14965,11 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, osal_fseek(env->me_lfd, safe_parking_lot_offset); eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC))) { + if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC +#if defined(_WIN32) || defined(_WIN64) + | MDBX_EXCLUSIVE +#endif /* !Windows */ + ))) { rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); if (MDBX_IS_ERROR(rc)) diff --git a/src/osal.c b/src/osal.c index dfe528e1..f1ae6ee7 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1160,7 +1160,7 @@ MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) { #else osal_free(ior->pool); #endif - memset(ior, -1, sizeof(osal_ioring_t)); + memset(ior, 0, sizeof(osal_ioring_t)); } /*----------------------------------------------------------------------------*/ From a98c73f4f6b71988aaf4dd481fc323caccaadbd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 12 Jan 2023 01:42:14 +0300 Subject: [PATCH 329/364] =?UTF-8?q?mdbx-cmake:=20=D0=B2=D1=8B=D0=B7=D0=BE?= =?UTF-8?q?=D0=B2=20mdbx=5Fchk=20=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC?= =?UTF-8?q?=D0=B5=20=D1=87=D1=82=D0=B5=D0=BD=D0=B8=D1=8F-=D0=B7=D0=B0?= =?UTF-8?q?=D0=BF=D0=B8=D1=81=D0=B8=20=D0=B4=D0=BB=D1=8F=20=D0=BF=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=D0=B5=D1=80=D0=BA=D0=B8=20MDBX=5FEXCLUSIVE=20?= =?UTF-8?q?=D0=B2=20=D1=8D=D1=82=D0=BE=D0=BC=20=D1=80=D0=B5=D0=B6=D0=B8?= =?UTF-8?q?=D0=BC=D0=B5.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 1 + test/CMakeLists.txt | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index d6092b1d..fe94a40a 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -25,6 +25,7 @@ and [by 
Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Исправление опечаток. - Доработка теста для полной стохастической проверки `MDBX_EKEYMISMATCH` в режиме `MDBX_APPEND`. + - Добавление в CMake-тесты вызова mdbx_chk в режиме чтения-записи для проверки MDBX_EXCLUSIVE в этом режиме. ------------------------------------------------------------------------------- diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0a067d09..1889c8b8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -92,11 +92,13 @@ else() set_tests_properties(smoke_chk PROPERTIES DEPENDS smoke TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES smoke.db) add_test(NAME smoke_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv smoke.db-copy) set_tests_properties(smoke_chk_copy PROPERTIES DEPENDS smoke TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES smoke.db-copy) endif() @@ -109,15 +111,16 @@ else() TIMEOUT 600 RUN_SERIAL OFF) if(MDBX_BUILD_TOOLS) - add_test(NAME dupsort_writemap_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv dupsort_writemap.db) + add_test(NAME dupsort_writemap_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvvwc dupsort_writemap.db) set_tests_properties(dupsort_writemap_chk PROPERTIES DEPENDS dupsort_writemap TIMEOUT 60 REQUIRED_FILES dupsort_writemap.db) - add_test(NAME dupsort_writemap_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv dupsort_writemap.db-copy) + add_test(NAME dupsort_writemap_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvvc dupsort_writemap.db-copy) set_tests_properties(dupsort_writemap_chk_copy PROPERTIES DEPENDS dupsort_writemap TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "monopolistic mode" REQUIRED_FILES dupsort_writemap.db-copy) endif() @@ -128,15 +131,17 @@ else() TIMEOUT 1800 RUN_SERIAL OFF) if(MDBX_BUILD_TOOLS) - add_test(NAME uniq_nested_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv uniq_nested.db) + add_test(NAME uniq_nested_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvvw uniq_nested.db) 
set_tests_properties(uniq_nested_chk PROPERTIES DEPENDS uniq_nested TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES uniq_nested.db) add_test(NAME uniq_nested_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv uniq_nested.db-copy) set_tests_properties(uniq_nested_chk_copy PROPERTIES DEPENDS uniq_nested TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES uniq_nested.db-copy) endif() From 0979a93a789b6f9d96cd1586e5121e6a2c60b29b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 12 Jan 2023 13:40:11 +0300 Subject: [PATCH 330/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=BE=20=D0=BF=D1=80=D0=B8=D0=BC=D0=B5=D1=87?= =?UTF-8?q?=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BE=D0=B1=20=D0=BE=D1=88=D0=B8?= =?UTF-8?q?=D0=B1=D0=BA=D0=B5=20MinGW=20MSYS2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mdbx_chk.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 19d50f8c..008de33c 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -153,6 +153,10 @@ static const char *sdb_name(const MDBX_val *val) { return ""; if (len > 65536) { static char buf[64]; + /* NOTE: There is MSYS2 MinGW bug if you here got + * the "unknown conversion type character ‘z’ in format [-Werror=format=]" + * https://stackoverflow.com/questions/74504432/whats-the-proper-way-to-tell-mingw-based-gcc-to-use-ansi-stdio-output-on-windo + */ snprintf(buf, sizeof(buf), "", len); return buf; } From a484a1f89bcbf38aeb7a81d6080605f86ddc7933 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 15 Jan 2023 20:35:09 +0300 Subject: [PATCH 331/364] =?UTF-8?q?mdbx:=20=D1=80=D0=B5=D1=84=D0=B0=D0=BA?= 
=?UTF-8?q?=D1=82=D0=BE=D1=80=D0=B8=D0=BD=D0=B3=20`dxb=5Fresize()`=20?= =?UTF-8?q?=D0=B8=20=D1=81=D0=B2=D1=8F=D0=B7=D0=B0=D0=BD=D0=BD=D0=BE=D0=B3?= =?UTF-8?q?=D0=BE=20=D0=BA=D0=BE=D0=B4=D0=B0.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit В том числе, для устранения срабатывания assert-проверки `size_bytes == env->me_dxb_mmap.current` в специфических многопоточных сценариях использования. Проверка срабатывала только в отладочных сборках, при специфическом наложении во времени читающей и пишущей транзакции в разных потоках, одновременно с изменением размера БД. Кроме срабатывание проверки, каких-либо других последствий не возникало. --- ChangeLog.md | 8 ++ src/core.c | 255 ++++++++++++++++++++++++++++------------------ src/lck-windows.c | 8 +- src/osal.c | 73 +++++++------ 4 files changed, 209 insertions(+), 135 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index fe94a40a..9fff6cf1 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -10,6 +10,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Max за сообщение о проблеме ERROR_SHARING_VIOLATION в режиме MDBX_EXCLUSIVE на Windows. + - Alisher Ashyrov https://t.me/a1is43ras4 за сообщение о проблеме с assert-проверкой и содействие в отладке. Исправления (без корректировок новых функций): @@ -21,6 +22,13 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic предотвращении ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая совсем не информативна для пользователя и возникает в этом случае. + - Рефакторинг `dxb_resize()`. В том числе, для устранения срабатывания + assert-проверки `size_bytes == env->me_dxb_mmap.current` в специфических + многопоточных сценариях использования. Проверка срабатывала только в + отладочных сборках, при специфическом наложении во времени читающей и + пишущей транзакции в разных потоках, одновременно с изменением размера БД. 
+ Кроме срабатывание проверки, каких-либо других последствий не возникало. + Ликвидация технических долгов и мелочи: - Исправление опечаток. diff --git a/src/core.c b/src/core.c index 4ffe694c..46033253 100644 --- a/src/core.c +++ b/src/core.c @@ -6011,21 +6011,43 @@ static void adjust_defaults(MDBX_env *env) { bytes2pgno(env, bytes_align2os_bytes(env, threshold)); } -__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, const pgno_t limit_pgno, - const bool implicit) { - const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); - const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); +enum resize_mode { implicit_grow, impilict_shrink, explicit_resize }; + +__cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, + const pgno_t size_pgno, pgno_t limit_pgno, + const enum resize_mode mode) { + /* Acquire guard to avoid collision between read and write txns + * around me_dbgeo and me_dxb_mmap */ +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_AcquireExclusive(&env->me_remap_guard); + int rc = MDBX_SUCCESS; + mdbx_handle_array_t *suspended = NULL; + mdbx_handle_array_t array_onstack; +#else + int rc = osal_fastmutex_acquire(&env->me_remap_guard); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; +#endif + const size_t prev_size = env->me_dxb_mmap.current; const size_t prev_limit = env->me_dxb_mmap.limit; + const pgno_t prev_limit_pgno = bytes2pgno(env, prev_limit); + eASSERT(env, prev_limit_pgno >= used_pgno); + if (mode < explicit_resize && size_pgno <= prev_limit_pgno) { + /* The actual mapsize may be less since the geo.upper may be changed + * by other process. Avoids remapping until it necessary. 
*/ + limit_pgno = prev_limit_pgno; + } + const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); + const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); #if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) const void *const prev_map = env->me_dxb_mmap.base; #endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ - VERBOSE("resize datafile/mapping: " + VERBOSE("resize/%d datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR, - prev_size, size_bytes, prev_limit, limit_bytes); + mode, prev_size, size_bytes, prev_limit, limit_bytes); eASSERT(env, limit_bytes >= size_bytes); eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); @@ -6033,20 +6055,18 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, unsigned mresize_flags = env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); -#if defined(_WIN32) || defined(_WIN64) - /* Acquire guard in exclusive mode for: - * - to avoid collision between read and write txns around env->me_dbgeo; - * - to avoid attachment of new reading threads (see osal_rdt_lock); */ - osal_srwlock_AcquireExclusive(&env->me_remap_guard); - mdbx_handle_array_t *suspended = NULL; - mdbx_handle_array_t array_onstack; - int rc = MDBX_SUCCESS; + if (mode >= impilict_shrink) + mresize_flags |= MDBX_SHRINK_ALLOWED; + if (limit_bytes == env->me_dxb_mmap.limit && size_bytes == env->me_dxb_mmap.current && size_bytes == env->me_dxb_mmap.filesize) goto bailout; - if ((env->me_flags & MDBX_NOTLS) == 0) { +#if defined(_WIN32) || defined(_WIN64) + if ((env->me_flags & MDBX_NOTLS) == 0 && + ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || + limit_bytes != env->me_dxb_mmap.limit)) { /* 1) Windows allows only extending a read-write section, but not a * corresponding mapped view. Therefore in other cases we must suspend * the local threads for safe remap. 
@@ -6064,65 +6084,61 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, ERROR("failed suspend-for-remap: errcode %d", rc); goto bailout; } - mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP - : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + mresize_flags |= (mode < explicit_resize) + ? MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; } #else /* Windows */ - /* Acquire guard to avoid collision between read and write txns - * around env->me_dbgeo */ - int rc = osal_fastmutex_acquire(&env->me_remap_guard); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if (limit_bytes == env->me_dxb_mmap.limit && - size_bytes == env->me_dxb_mmap.current) - goto bailout; - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && - lck && !implicit) { - int err = osal_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(MDBX_IS_ERROR(err))) { - rc = err; - goto bailout; - } - - /* looking for readers from this process */ - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - eASSERT(env, !implicit); + if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit && + !(env->me_flags & MDBX_NOTLS)) { mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - for (size_t i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { - /* the base address of the mapping can't be changed since - * the other reader thread from this process exists. 
*/ - osal_rdt_unlock(env); - mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); - break; + if (lck) { + int err = osal_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } + + /* looking for readers from this process */ + const size_t snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + eASSERT(env, mode == explicit_resize); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid.weak == env->me_pid && + lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + osal_rdt_unlock(env); + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); + break; + } } } } #endif /* ! Windows */ - if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), - MDBX_SYNC_NONE); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - const pgno_t aligned_munlock_pgno = (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) ? 
0 : bytes2pgno(env, size_bytes); + if (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) { + mincore_clean_cache(env); + if ((env->me_flags & MDBX_WRITEMAP) && + env->me_lck->mti_unsynced_pages.weak) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } munlock_after(env, aligned_munlock_pgno, size_bytes); - mincore_clean_cache(env); #if MDBX_ENABLE_MADVISE - if (size_bytes < prev_size) { + if (size_bytes < prev_size && mode > implicit_grow) { NOTICE("resize-MADV_%s %u..%u", (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, bytes2pgno(env, prev_size)); @@ -6181,7 +6197,10 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, if (rc == MDBX_SUCCESS) { eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); - eASSERT(env, size_bytes == env->me_dxb_mmap.current); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->me_dxb_mmap.current); env->me_lck->mti_discarded_tail.weak = size_pgno; const bool readahead = !(env->me_flags & MDBX_NORDAHEAD) && @@ -6200,7 +6219,10 @@ bailout: if (rc == MDBX_SUCCESS) { eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); - eASSERT(env, size_bytes == env->me_dxb_mmap.current); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->me_dxb_mmap.current); /* update env-geo to avoid influences */ env->me_dbgeo.now = env->me_dxb_mmap.current; env->me_dbgeo.upper = env->me_dxb_mmap.limit; @@ -6255,21 +6277,6 @@ bailout: return rc; } -__cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, - const pgno_t 
size_pgno, - const pgno_t limit_pgno) { - const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit); - eASSERT(env, mapped_pgno >= used_pgno); - return map_resize( - env, used_pgno, size_pgno, - (size_pgno > mapped_pgno) - ? limit_pgno - : /* The actual mapsize may be less since the geo.upper may be changed - by other process. So, avoids remapping until it necessary. */ - mapped_pgno, - true); -} - static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, const pgno_t pgno) { MDBX_meta *const meta = METAPAGE(env, pgno); @@ -7649,8 +7656,8 @@ no_gc: VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, aligned - txn->mt_end_pgno); - ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, - txn->mt_geo.upper); + ret.err = dxb_resize(env, txn->mt_next_pgno, (pgno_t)aligned, + txn->mt_geo.upper, implicit_grow); if (ret.err != MDBX_SUCCESS) { ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, aligned - txn->mt_end_pgno, ret.err); @@ -8095,8 +8102,8 @@ retry:; if (!inside_txn && locked && (env->me_flags & MDBX_WRITEMAP) && unlikely(head.ptr_c->mm_geo.next > bytes2pgno(env, env->me_dxb_mmap.current))) { - rc = map_resize_implicit(env, head.ptr_c->mm_geo.next, - head.ptr_c->mm_geo.now, head.ptr_c->mm_geo.upper); + rc = dxb_resize(env, head.ptr_c->mm_geo.next, head.ptr_c->mm_geo.now, + head.ptr_c->mm_geo.upper, implicit_grow); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -8974,6 +8981,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; rc = setup_dbx(&txn->mt_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], env->me_psize); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; txn->mt_dbistate[FREE_DBI] = DBI_VALID; txn->mt_front = txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); @@ -8982,34 +8991,80 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { WARNING("%s", "environment had fatal error, must shutdown!"); rc = 
MDBX_PANIC; } else { - const size_t size = - pgno2bytes(env, (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_next_pgno - : txn->mt_end_pgno); - if (unlikely(size > env->me_dxb_mmap.limit)) { + const size_t size_bytes = pgno2bytes(env, txn->mt_end_pgno); + const size_t used_bytes = pgno2bytes(env, txn->mt_next_pgno); + const size_t required_bytes = + (txn->mt_flags & MDBX_TXN_RDONLY) ? used_bytes : size_bytes; + if (unlikely(required_bytes > env->me_dxb_mmap.current)) { + /* Размер БД (для пишущих транзакций) или используемых данных (для + * читающих транзакций) больше предыдущего/текущего размера внутри + * процесса, увеличиваем. Сюда также попадает случай увеличения верхней + * границы размера БД и отображения. В читающих транзакциях нельзя + * изменять размер файла, который может быть больше необходимого этой + * транзакции. */ if (txn->mt_geo.upper > MAX_PAGENO + 1 || bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) != txn->mt_geo.upper) { rc = MDBX_UNABLE_EXTEND_MAPSIZE; goto bailout; } - rc = map_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, - txn->mt_geo.upper, - (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); - if (rc != MDBX_SUCCESS) + rc = dxb_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper, implicit_grow); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else if (unlikely(size_bytes < env->me_dxb_mmap.current)) { + /* Размер БД меньше предыдущего/текущего размера внутри процесса, можно + * уменьшить, но всё сложнее: + * - размер файла согласован со всеми читаемыми снимками на момент + * коммита последней транзакции; + * - в читающей транзакции размер файла может быть больше и него нельзя + * изменять, в том числе менять madvise (меньша размера файла нельзя, + * а за размером нет смысла). + * - в пишущей транзакции уменьшать размер файла можно только после + * проверки размера читаемых снимков, но в этом нет смысла, так как + * это будет сделано при фиксации транзакции. 
+ * + * В сухом остатке, можно только установить dxb_mmap.current равным + * размеру файла, а это проще сделать без вызова dxb_resize() и усложения + * внутренней логики. + * + * В этой тактике есть недостаток: если пишущите транзакции не регулярны, + * и при завершении такой транзакции файл БД остаётся не-уменьшеным из-за + * читающих транзакций использующих предыдущие снимки. */ +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_AcquireShared(&env->me_remap_guard); +#else + rc = osal_fastmutex_acquire(&env->me_remap_guard); +#endif + if (likely(rc == MDBX_SUCCESS)) { + rc = osal_filesize(env->me_dxb_mmap.fd, &env->me_dxb_mmap.filesize); + if (likely(rc == MDBX_SUCCESS)) { + eASSERT(env, env->me_dxb_mmap.filesize >= required_bytes); + if (env->me_dxb_mmap.current > env->me_dxb_mmap.filesize) + env->me_dxb_mmap.current = (size_t)env->me_dxb_mmap.filesize; + } +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_ReleaseShared(&env->me_remap_guard); +#else + int err = osal_fastmutex_release(&env->me_remap_guard); + if (unlikely(err) && likely(rc == MDBX_SUCCESS)) + rc = err; +#endif + } + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } else { - env->me_dxb_mmap.current = size; - env->me_dxb_mmap.filesize = - (env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize; } + eASSERT(env, + pgno2bytes(env, txn->mt_next_pgno) <= env->me_dxb_mmap.current); + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); if (txn->mt_flags & MDBX_TXN_RDONLY) { #if defined(_WIN32) || defined(_WIN64) - if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) || + if (((used_bytes > env->me_dbgeo.lower && env->me_dbgeo.shrink) || (mdbx_RunningUnderWine() && /* under Wine acquisition of remap_guard is always required, * since Wine don't support section extending, * i.e. in both cases unmap+map are required. 
*/ - size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && + used_bytes < env->me_dbgeo.upper && env->me_dbgeo.grow)) && /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; osal_srwlock_AcquireShared(&env->me_remap_guard); @@ -9799,8 +9854,8 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { /* undo resize performed by child txn */ - rc = map_resize_implicit(env, parent->mt_next_pgno, parent->mt_geo.now, - parent->mt_geo.upper); + rc = dxb_resize(env, parent->mt_next_pgno, parent->mt_geo.now, + parent->mt_geo.upper, impilict_shrink); if (rc == MDBX_EPERM) { /* unable undo resize (it is regular for Windows), * therefore promote size changes from child to the parent txn */ @@ -12859,8 +12914,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (unlikely(shrink)) { VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", pending->mm_geo.now, shrink); - rc = map_resize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, - pending->mm_geo.upper); + rc = dxb_resize(env, pending->mm_geo.next, pending->mm_geo.now, + pending->mm_geo.upper, impilict_shrink); if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) goto fail; eASSERT(env, coherency_check_meta(env, target, true)); @@ -13453,8 +13508,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { - rc = map_resize(env, current_geo->next, new_geo.now, new_geo.upper, - false); + rc = dxb_resize(env, current_geo->next, new_geo.now, new_geo.upper, + explicit_resize); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } diff --git a/src/lck-windows.c b/src/lck-windows.c index e6ae78d2..5cbf10cf 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -752,7 +752,7 @@ static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { // If 
there's a writer already, spin without unnecessarily // interlocking the CPUs if (srwl->writerCount != 0) { - YieldProcessor(); + SwitchToThread(); continue; } @@ -766,7 +766,7 @@ static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { // Remove from the readers list, spin, try again _InterlockedDecrement(&srwl->readerCount); - YieldProcessor(); + SwitchToThread(); } } @@ -782,7 +782,7 @@ static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { // If there's a writer already, spin without unnecessarily // interlocking the CPUs if (srwl->writerCount != 0) { - YieldProcessor(); + SwitchToThread(); continue; } @@ -797,7 +797,7 @@ static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { // that we're the writer. while (srwl->readerCount != 0) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); - YieldProcessor(); + SwitchToThread(); } } diff --git a/src/osal.c b/src/osal.c index f1ae6ee7..cd0e1dde 100644 --- a/src/osal.c +++ b/src/osal.c @@ -2181,6 +2181,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { err = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate %zu, err %d", size, err); if (err != MDBX_SUCCESS) return err; map->filesize = size; @@ -2189,6 +2190,7 @@ MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, #endif /* !Windows */ } else { err = osal_filesize(map->fd, &map->filesize); + VERBOSE("filesize %" PRIu64 ", err %d", map->filesize, err); if (err != MDBX_SUCCESS) return err; #if defined(_WIN32) || defined(_WIN64) @@ -2306,8 +2308,7 @@ MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. 
- * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 - */ + * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION( map->base, (map->filesize && map->filesize < map->limit) ? map->filesize : map->limit); @@ -2332,25 +2333,38 @@ MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit) { + int rc = osal_filesize(map->fd, &map->filesize); + VERBOSE("flags 0x%x, size %zu, limit %zu, filesize %" PRIu64, flags, size, + limit, map->filesize); assert(size <= limit); + if (rc != MDBX_SUCCESS) { + map->filesize = 0; + return rc; + } + #if defined(_WIN32) || defined(_WIN64) assert(size != map->current || limit != map->limit || size < map->filesize); NTSTATUS status; LARGE_INTEGER SectionSize; - int err, rc = MDBX_SUCCESS; + int err; - if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current && - /* workaround for Wine */ mdbx_NtExtendSection) { - /* growth rw-section */ - SectionSize.QuadPart = size; - status = mdbx_NtExtendSection(map->section, &SectionSize); - if (!NT_SUCCESS(status)) - return ntstatus2errcode(status); - map->current = size; - if (map->filesize < size) - map->filesize = size; - return MDBX_SUCCESS; + if (limit == map->limit && size > map->current) { + if ((flags & MDBX_RDONLY) && map->filesize >= size) { + map->current = size; + return MDBX_SUCCESS; + } else if (!(flags & MDBX_RDONLY) && + /* workaround for Wine */ mdbx_NtExtendSection) { + /* growth rw-section */ + SectionSize.QuadPart = size; + status = mdbx_NtExtendSection(map->section, &SectionSize); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + map->current = size; + if (map->filesize < size) + map->filesize = size; + return MDBX_SUCCESS; + } } if (limit > map->limit) { @@ -2379,13 +2393,15 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, * - change size of mapped view; * 
- extend read-only mapping; * Therefore we should unmap/map entire section. */ - if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { + if (size <= map->current && limit == map->limit) + return MDBX_SUCCESS; return MDBX_EPERM; + } /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. - * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 - */ + * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->limit); status = NtUnmapViewOfSection(GetCurrentProcess(), map->base); if (!NT_SUCCESS(status)) @@ -2398,7 +2414,6 @@ MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, if (!NT_SUCCESS(status)) { bailout_ntstatus: err = ntstatus2errcode(status); - bailout: map->base = NULL; map->current = map->limit = 0; if (ReservedAddress) { @@ -2427,10 +2442,6 @@ retry_file_and_section: map->base = NULL; } - err = osal_filesize(map->fd, &map->filesize); - if (err != MDBX_SUCCESS) - goto bailout; - if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { err = osal_ftruncate(map->fd, size); if (err == MDBX_SUCCESS) @@ -2507,18 +2518,17 @@ retry_mapview:; #else /* Windows */ - map->filesize = 0; - int rc = osal_filesize(map->fd, &map->filesize); - if (rc != MDBX_SUCCESS) - return rc; - if (flags & MDBX_RDONLY) { + if (size > map->filesize) + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + else if (size < map->filesize && map->filesize > limit) + rc = MDBX_EPERM; map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; - if (map->current != size) - rc = (size > map->current) ? 
MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; } else { - if (map->filesize != size) { + if (size > map->filesize || + (size < map->filesize && (flags & MDBX_SHRINK_ALLOWED))) { rc = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate %zu, err %d", size, rc); if (rc != MDBX_SUCCESS) return rc; map->filesize = size; @@ -2713,7 +2723,8 @@ retry_mapview:; assert(rc != MDBX_SUCCESS || (map->base != nullptr && map->base != MAP_FAILED && - map->current == size && map->limit == limit)); + map->current == size && map->limit == limit && + map->filesize >= size)); return rc; } From c01f025bfa3a5ef0d924536313f92774815d9db3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 16 Jan 2023 16:24:51 +0300 Subject: [PATCH 332/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B3=D0=BE=D0=B4=D0=B0=20?= =?UTF-8?q?=D0=BD=D0=B0=202023.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 2 +- COPYRIGHT | 2 +- cmake/compiler.cmake | 2 +- cmake/profile.cmake | 2 +- cmake/utils.cmake | 2 +- example/example-mdbx.c | 2 +- example/sample-bdb.txt | 2 +- mdbx.h | 2 +- mdbx.h++ | 2 +- src/alloy.c | 2 +- src/base.h | 2 +- src/core.c | 2 +- src/internals.h | 2 +- src/lck-posix.c | 2 +- src/lck-windows.c | 2 +- src/man1/mdbx_chk.1 | 4 ++-- src/man1/mdbx_copy.1 | 4 ++-- src/man1/mdbx_drop.1 | 4 ++-- src/man1/mdbx_dump.1 | 4 ++-- src/man1/mdbx_load.1 | 4 ++-- src/man1/mdbx_stat.1 | 4 ++-- src/mdbx.c++ | 2 +- src/mdbx_chk.c | 2 +- src/mdbx_copy.c | 2 +- src/mdbx_drop.c | 4 ++-- src/mdbx_dump.c | 2 +- src/mdbx_load.c | 2 +- src/mdbx_stat.c | 2 +- src/osal.c | 2 +- src/osal.h | 2 +- test/append.c++ | 2 +- test/base.h++ | 2 +- test/cases.c++ | 2 +- test/chrono.c++ | 2 +- test/chrono.h++ | 2 +- test/config.c++ | 2 +- test/config.h++ | 2 +- test/dead.c++ | 2 +- test/hill.c++ | 2 +- 
test/jitter.c++ | 2 +- test/keygen.c++ | 2 +- test/keygen.h++ | 2 +- test/log.c++ | 2 +- test/log.h++ | 2 +- test/main.c++ | 2 +- test/nested.c++ | 2 +- test/osal-unix.c++ | 2 +- test/osal-windows.c++ | 2 +- test/osal.h++ | 2 +- test/pcrf/pcrf_test.c | 2 +- test/test.c++ | 2 +- test/test.h++ | 2 +- test/ttl.c++ | 2 +- test/utils.c++ | 2 +- test/utils.h++ | 2 +- 55 files changed, 62 insertions(+), 62 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5167d62..777a3f30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ## -## Copyright 2020-2022 Leonid Yuriev +## Copyright 2020-2023 Leonid Yuriev ## and other libmdbx authors: please see AUTHORS file. ## All rights reserved. ## diff --git a/COPYRIGHT b/COPYRIGHT index bd3acace..352beaed 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright 2015-2022 Leonid Yuriev . +Copyright 2015-2023 Leonid Yuriev . Copyright 2011-2015 Howard Chu, Symas Corp. Copyright 2015,2016 Peter-Service R&D LLC. All rights reserved. diff --git a/cmake/compiler.cmake b/cmake/compiler.cmake index 78a31946..dd6f71c8 100644 --- a/cmake/compiler.cmake +++ b/cmake/compiler.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/cmake/profile.cmake b/cmake/profile.cmake index c9b8bed4..f13b6976 100644 --- a/cmake/profile.cmake +++ b/cmake/profile.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6a3315e1..aa8aef01 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . 
+## Copyright (c) 2012-2023 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/example/example-mdbx.c b/example/example-mdbx.c index a3735f9a..0e6148d9 100644 --- a/example/example-mdbx.c +++ b/example/example-mdbx.c @@ -4,7 +4,7 @@ */ /* - * Copyright 2015-2022 Leonid Yuriev . + * Copyright 2015-2023 Leonid Yuriev . * Copyright 2017 Ilya Shipitsin . * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. diff --git a/example/sample-bdb.txt b/example/sample-bdb.txt index 503d97cb..d3478a16 100644 --- a/example/sample-bdb.txt +++ b/example/sample-bdb.txt @@ -4,7 +4,7 @@ */ /* - * Copyright 2015-2022 Leonid Yuriev . + * Copyright 2015-2023 Leonid Yuriev . * Copyright 2012-2015 Howard Chu, Symas Corp. * Copyright 2015,2016 Peter-Service R&D LLC. * All rights reserved. diff --git a/mdbx.h b/mdbx.h index c4e0b756..76a42d48 100644 --- a/mdbx.h +++ b/mdbx.h @@ -25,7 +25,7 @@ _The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет \section copyright LICENSE & COPYRIGHT -\authors Copyright (c) 2015-2022, Leonid Yuriev +\authors Copyright (c) 2015-2023, Leonid Yuriev and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file. \copyright Redistribution and use in source and binary forms, with or without diff --git a/mdbx.h++ b/mdbx.h++ index 4c591641..704688b9 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -1,7 +1,7 @@ /// \file mdbx.h++ /// \brief The libmdbx C++ API header file. /// -/// \author Copyright (c) 2020-2022, Leonid Yuriev . +/// \author Copyright (c) 2020-2023, Leonid Yuriev . /// \copyright SPDX-License-Identifier: Apache-2.0 /// /// Tested with: diff --git a/src/alloy.c b/src/alloy.c index 1e770f23..7d0cf636 100644 --- a/src/alloy.c +++ b/src/alloy.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * diff --git a/src/base.h b/src/base.h index bf5a5007..753ad005 100644 --- a/src/base.h +++ b/src/base.h @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/core.c b/src/core.c index 46033253..ef6a3e59 100644 --- a/src/core.c +++ b/src/core.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev . + * Copyright 2015-2023 Leonid Yuriev . * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/internals.h b/src/internals.h index d15c21aa..b5b8ead6 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/lck-posix.c b/src/lck-posix.c index afbe542b..cb55727e 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/lck-windows.c b/src/lck-windows.c index 5cbf10cf..500510d9 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/man1/mdbx_chk.1 b/src/man1/mdbx_chk.1 index e0587e99..9141bf7a 100644 --- a/src/man1/mdbx_chk.1 +++ b/src/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_CHK 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_CHK 1 "2023-01-07" "MDBX 0.12.3" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/src/man1/mdbx_copy.1 b/src/man1/mdbx_copy.1 index 49e2b4d4..b83c0a27 100644 --- a/src/man1/mdbx_copy.1 +++ b/src/man1/mdbx_copy.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_COPY 1 "2023-01-07" "MDBX 0.12.3" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/src/man1/mdbx_drop.1 b/src/man1/mdbx_drop.1 index ec01905b..e2beaef3 100644 --- a/src/man1/mdbx_drop.1 +++ b/src/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ -.\" Copyright 2021-2022 Leonid Yuriev . +.\" Copyright 2021-2023 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_DROP 1 "2023-01-07" "MDBX 0.12.3" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/src/man1/mdbx_dump.1 b/src/man1/mdbx_dump.1 index 5e173903..007705a3 100644 --- a/src/man1/mdbx_dump.1 +++ b/src/man1/mdbx_dump.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_DUMP 1 "2023-01-07" "MDBX 0.12.3" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/src/man1/mdbx_load.1 b/src/man1/mdbx_load.1 index 44dbe7d7..65ed20aa 100644 --- a/src/man1/mdbx_load.1 +++ b/src/man1/mdbx_load.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . 
+.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_LOAD 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_LOAD 1 "2023-01-07" "MDBX 0.12.3" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/src/man1/mdbx_stat.1 b/src/man1/mdbx_stat.1 index 3bc3664a..31302e03 100644 --- a/src/man1/mdbx_stat.1 +++ b/src/man1/mdbx_stat.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2022-11-11" "MDBX 0.12.2" +.TH MDBX_STAT 1 "2023-01-07" "MDBX 0.12.3" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS diff --git a/src/mdbx.c++ b/src/mdbx.c++ index 17716c4b..8a74e412 100644 --- a/src/mdbx.c++ +++ b/src/mdbx.c++ @@ -1,5 +1,5 @@ // -// Copyright (c) 2020-2022, Leonid Yuriev . +// Copyright (c) 2020-2023, Leonid Yuriev . // SPDX-License-Identifier: Apache-2.0 // // Non-inline part of the libmdbx C++ API diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 008de33c..7a13e733 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -1,7 +1,7 @@ /* mdbx_chk.c - memory-mapped database check tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/mdbx_copy.c b/src/mdbx_copy.c index b070449b..52adc312 100644 --- a/src/mdbx_copy.c +++ b/src/mdbx_copy.c @@ -1,7 +1,7 @@ /* mdbx_copy.c - memory-mapped database backup tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/src/mdbx_drop.c b/src/mdbx_drop.c index 0680fc11..859710a6 100644 --- a/src/mdbx_drop.c +++ b/src/mdbx_drop.c @@ -1,10 +1,10 @@ /* mdbx_drop.c - memory-mapped database delete tool */ /* - * Copyright 2021 Leonid Yuriev + * Copyright 2021-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * - * Copyright 2016-2022 Howard Chu, Symas Corp. + * Copyright 2016-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index 8c266aed..21a695e2 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -1,7 +1,7 @@ /* mdbx_dump.c - memory-mapped database dump tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/mdbx_load.c b/src/mdbx_load.c index b9fdfd8c..8a7a191a 100644 --- a/src/mdbx_load.c +++ b/src/mdbx_load.c @@ -1,7 +1,7 @@ /* mdbx_load.c - memory-mapped database load tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/mdbx_stat.c b/src/mdbx_stat.c index ebf53324..860cf54d 100644 --- a/src/mdbx_stat.c +++ b/src/mdbx_stat.c @@ -1,7 +1,7 @@ /* mdbx_stat.c - memory-mapped database status tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/osal.c b/src/osal.c index cd0e1dde..45cb92c8 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1,7 +1,7 @@ /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/src/osal.h b/src/osal.h index 77277279..02447376 100644 --- a/src/osal.h +++ b/src/osal.h @@ -1,7 +1,7 @@ /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/append.c++ b/test/append.c++ index 59e9ec12..d2486001 100644 --- a/test/append.c++ +++ b/test/append.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/base.h++ b/test/base.h++ index fa22ce60..f4e083bd 100644 --- a/test/base.h++ +++ b/test/base.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/cases.c++ b/test/cases.c++ index 51d84b86..97421e7d 100644 --- a/test/cases.c++ +++ b/test/cases.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/chrono.c++ b/test/chrono.c++ index 33456867..71273e92 100644 --- a/test/chrono.c++ +++ b/test/chrono.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/chrono.h++ b/test/chrono.h++ index 4f86cf65..5d29b1c2 100644 --- a/test/chrono.h++ +++ b/test/chrono.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/test/config.c++ b/test/config.c++ index e86984d4..31cf9395 100644 --- a/test/config.c++ +++ b/test/config.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/config.h++ b/test/config.h++ index 1e6e57e5..f57dce7c 100644 --- a/test/config.h++ +++ b/test/config.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/dead.c++ b/test/dead.c++ index 0d698d91..d0f8cb09 100644 --- a/test/dead.c++ +++ b/test/dead.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/hill.c++ b/test/hill.c++ index 3a5c29f3..79234b7d 100644 --- a/test/hill.c++ +++ b/test/hill.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/jitter.c++ b/test/jitter.c++ index 391d5deb..b25599b0 100644 --- a/test/jitter.c++ +++ b/test/jitter.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/keygen.c++ b/test/keygen.c++ index 2cd7e574..e8e53262 100644 --- a/test/keygen.c++ +++ b/test/keygen.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/test/keygen.h++ b/test/keygen.h++ index 54122ab1..9e2410fd 100644 --- a/test/keygen.h++ +++ b/test/keygen.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/log.c++ b/test/log.c++ index f9cb1194..bc52432e 100644 --- a/test/log.c++ +++ b/test/log.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/log.h++ b/test/log.h++ index 0ff12ec2..aa111ac9 100644 --- a/test/log.h++ +++ b/test/log.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/main.c++ b/test/main.c++ index 0cd82dbd..c96fc58f 100644 --- a/test/main.c++ +++ b/test/main.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/nested.c++ b/test/nested.c++ index 60c02ae9..48299c79 100644 --- a/test/nested.c++ +++ b/test/nested.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/osal-unix.c++ b/test/osal-unix.c++ index 1711518e..094d6769 100644 --- a/test/osal-unix.c++ +++ b/test/osal-unix.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/test/osal-windows.c++ b/test/osal-windows.c++ index f656d70a..57746532 100644 --- a/test/osal-windows.c++ +++ b/test/osal-windows.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/osal.h++ b/test/osal.h++ index 0fe44f68..ef3b5562 100644 --- a/test/osal.h++ +++ b/test/osal.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/pcrf/pcrf_test.c b/test/pcrf/pcrf_test.c index 96bb631d..1d1f1e7e 100644 --- a/test/pcrf/pcrf_test.c +++ b/test/pcrf/pcrf_test.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2022 Leonid Yuriev . + * Copyright 2016-2023 Leonid Yuriev . * Copyright 2015 Vladimir Romanov * , Yota Lab. * diff --git a/test/test.c++ b/test/test.c++ index c42b598f..1e8429c5 100644 --- a/test/test.c++ +++ b/test/test.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/test.h++ b/test/test.h++ index 4442aaa7..52a2add3 100644 --- a/test/test.h++ +++ b/test/test.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/ttl.c++ b/test/ttl.c++ index f8239e94..a7049022 100644 --- a/test/ttl.c++ +++ b/test/ttl.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/test/utils.c++ b/test/utils.c++ index 9e61e4bf..71d56eb8 100644 --- a/test/utils.c++ +++ b/test/utils.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/utils.h++ b/test/utils.h++ index 3fbf4513..98763536 100644 --- a/test/utils.h++ +++ b/test/utils.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * From 3ade7c7ba114cb2d4a35487ce8fd2c31a4b04c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 16 Jan 2023 19:12:08 +0300 Subject: [PATCH 333/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=81=D1=82=D0=B0=D1=82=D1=83?= =?UTF-8?q?=D1=81=D0=B0=20MithrilDB.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 41 +++++++++++++++++++++++++++++++++++------ docs/_toc.md | 2 +- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 818bf0d7..46e1c549 100644 --- a/README.md +++ b/README.md @@ -81,19 +81,48 @@ Historically, _libmdbx_ is a deeply revised and extended descendant of the amazi [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). _libmdbx_ inherits all benefits from _LMDB_, but resolves some issues and adds [a set of improvements](#improvements-beyond-lmdb). +### MithrilDB and Future + -The next version is under active non-public development from scratch and will be +The next version is under non-public development from scratch and will be released as **MithrilDB** and `libmithrildb` for libraries & packages. 
Admittedly mythical [Mithril](https://en.wikipedia.org/wiki/Mithril) is resembling silver but being stronger and lighter than steel. Therefore _MithrilDB_ is a rightly relevant name. - > _MithrilDB_ will be radically different from _libmdbx_ by the new - > database format and API based on C++17, as well as the [Apache 2.0 - > License](https://www.apache.org/licenses/LICENSE-2.0). The goal of this - > revolution is to provide a clearer and robust API, add more features and - > new valuable properties of the database. +_MithrilDB_ is radically different from _libmdbx_ by the new database +format and API based on C++20. The goal of this revolution is to provide +a clearer and robust API, add more features and new valuable properties +of the database. All fundamental architectural problems of libmdbx/LMDB +have been solved there, but now the active development has been +suspended for top-three reasons: + +1. For now _libmdbx_ «mostly» enough for all [our products](https://www.ptsecurity.com/ww-en/products/), +and I’m busy in development of replication for scalability. +2. Waiting for fresh [Elbrus CPU](https://wiki.elbrus.ru/) of [e2k architecture](https://en.wikipedia.org/wiki/Elbrus_2000), +especially with hardware acceleration of [Streebog](https://en.wikipedia.org/wiki/Streebog) and +[Kuznyechik](https://en.wikipedia.org/wiki/Kuznyechik), which are required for Merkle tree, etc. +3. The expectation of needs and opportunities due to the wide use of NVDIMM (aka persistent memory), +modern NVMe and [Ангара](https://ru.wikipedia.org/wiki/Ангара_(интерконнект)). + +However, _MithrilDB_ will not be available for countries unfriendly to +Russia (i.e. acceded the sanctions, devil adepts and/or NATO). But it is +not yet known whether such restriction will be implemented only through +a license and support, either the source code will not be open at all. 
+Basically we are not inclined to allow our work to contribute to the +profit that goes to weapons that kill our relatives and friends. +NO OPTIONS. + +Nonetheless, I try not to make any promises regarding _MithrilDB_ until release. + +Contrary to _MithrilDB_, _libmdbx_ will forever free and open source. +Moreover with high-quality support whenever possible. Tu deviens +responsable pour toujours de ce que tu as apprivois. So we will continue +to comply with the original open license and the principles of +constructive cooperation, in spite of outright Github sabotage and +sanctions. I will also try to keep (not drop) Windows support, despite +it is an unused obsolete technology for us. diff --git a/docs/_toc.md b/docs/_toc.md index 434806f0..45482dc6 100644 --- a/docs/_toc.md +++ b/docs/_toc.md @@ -38,4 +38,4 @@ including creating [merge-request](https://gitflic.ru/project/erthink/libmdbx/me --- -\section MithrilDB MithrilDB +\section MithrilDB MithrilDB and Future From 486711945d574a9177ea3f2f5841d316fe1e3d93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 17 Jan 2023 21:52:57 +0300 Subject: [PATCH 334/364] =?UTF-8?q?mdbx-doc:=20=D0=B8=D1=81=D0=BF=D1=80?= =?UTF-8?q?=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20copy&paste=20?= =?UTF-8?q?=D0=BE=D0=BF=D0=B5=D1=87=D0=B0=D1=82=D0=BA=D0=B8=20=D0=B2=20"Ge?= =?UTF-8?q?tting=20started".?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/_starting.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_starting.md b/docs/_starting.md index 95783fd0..30336857 100644 --- a/docs/_starting.md +++ b/docs/_starting.md @@ -67,7 +67,7 @@ the end is hit. To retrieve all keys starting from a specified key value, use \ref MDBX_SET. For more cursor operations, see the \ref c_api reference. 
-When using \ref mdbx_cursor_put()\ref , either the function will position the cursor +When using \ref mdbx_cursor_put(), either the function will position the cursor for you based on the key, or you can use operation \ref MDBX_CURRENT to use the current position of the cursor. \note Note that key must then match the current position's key. From 68ebbe1fdebe78c8ed61080a055709032c462cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 18 Jan 2023 18:34:52 +0300 Subject: [PATCH 335/364] =?UTF-8?q?mdbx:=20=D0=9E=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index 9fff6cf1..3e7c2c7e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -29,6 +29,8 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic пишущей транзакции в разных потоках, одновременно с изменением размера БД. Кроме срабатывание проверки, каких-либо других последствий не возникало. + - Исправление copy&paste опечатки в разделе "Getting started" документации. + Ликвидация технических долгов и мелочи: - Исправление опечаток. @@ -41,7 +43,8 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic ## v0.12.3 (Акула) от 2023-01-07 -Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source проекте "Акула". +Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source +[проекте "Акула"](https://erigon.substack.com/p/winding-down-support-for-akula-project). Добавлена prefault-запись, переделан контроль “некогерентности” unified page/buffer cache, изменена тактика слияния страниц и т.д. Стало ещё быстрее, в некоторых сценариях вдвое. 
From 209f784ee71e32fa657480f8c24fce5058dbd8dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 23 Jan 2023 23:52:04 +0300 Subject: [PATCH 336/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20assert-=D0=BF=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=D0=B5=D1=80=D0=BE=D0=BA=20=D0=B2=D0=BD=D1=83=D1=82?= =?UTF-8?q?=D1=80=D0=B8=20`dxb=5Fresize()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Устранение регресса после a484a1f89bcbf38aeb7a81d6080605f86ddc7933. Проверка `prev_limit_pgno >= used_pgno` правомочна только в части сценариев, но не в общем случае. --- src/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index ef6a3e59..94adb823 100644 --- a/src/core.c +++ b/src/core.c @@ -6032,7 +6032,8 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, const size_t prev_size = env->me_dxb_mmap.current; const size_t prev_limit = env->me_dxb_mmap.limit; const pgno_t prev_limit_pgno = bytes2pgno(env, prev_limit); - eASSERT(env, prev_limit_pgno >= used_pgno); + eASSERT(env, limit_pgno >= size_pgno); + eASSERT(env, size_pgno >= used_pgno); if (mode < explicit_resize && size_pgno <= prev_limit_pgno) { /* The actual mapsize may be less since the geo.upper may be changed * by other process. Avoids remapping until it necessary. 
*/ From fb827959a9d2d12f1145d7409f08199809481e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 1 Feb 2023 01:03:51 +0300 Subject: [PATCH 337/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20`put(MDBX=5FUPSERT+MDBX?= =?UTF-8?q?=5FALLDUPS)`=20=D0=B4=D0=BB=D1=8F=20=D1=81=D0=BB=D1=83=D1=87?= =?UTF-8?q?=D0=B0=D1=8F=20=D0=B7=D0=B0=D0=BC=D0=B5=D0=BD=D1=8B=20=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D1=85=20=D0=B7=D0=BD=D0=B0=D1=87=D0=B5=D0=BD=D0=B8?= =?UTF-8?q?=D0=B9=20=D0=B2=20subDb.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed cursor_put_nochecklen() internals for case when dupsort'ed named subDb contains a single key with multiple values (aka duplicates), which are replaced with a single value by put-operation with the `MDBX_UPSERT+MDBX_ALLDUPS` flags. In this case, the database becomes completely empty, without any pages. However exactly this condition was not considered and thus wasn't handled correctly. Fixes https://gitflic.ru/project/erthink/libmdbx/issue/8 Thanks Masatoshi Fukunaga for reporting. --- ChangeLog.md | 11 ++++++++++- src/core.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 3e7c2c7e..e16aea42 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -10,7 +10,10 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Max за сообщение о проблеме ERROR_SHARING_VIOLATION в режиме MDBX_EXCLUSIVE на Windows. - - Alisher Ashyrov https://t.me/a1is43ras4 за сообщение о проблеме с assert-проверкой и содействие в отладке. + - Alisher Ashyrov за сообщение о проблеме + с assert-проверкой и содействие в отладке. + - Masatoshi Fukunaga за сообщение о проблеме + `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены всех значений в subDb. 
Исправления (без корректировок новых функций): @@ -31,6 +34,12 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Исправление copy&paste опечатки в разделе "Getting started" документации. + - Устранение проблемы `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены + всех значений единственного ключа в subDb. В ходе этой операции subDb + становится полностью пустой, без каких-либо страниц и именно эта + ситуация не была учтена в коде, что приводило к повреждению БД + при фиксации такой транзакции. + Ликвидация технических долгов и мелочи: - Исправление опечаток. diff --git a/src/core.c b/src/core.c index 94adb823..4eb57d6f 100644 --- a/src/core.c +++ b/src/core.c @@ -17275,7 +17275,7 @@ static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, if (unlikely(err != MDBX_SUCCESS)) return err; flags -= MDBX_ALLDUPS; - rc = MDBX_NOTFOUND; + rc = mc->mc_snum ? MDBX_NOTFOUND : MDBX_NO_ROOT; exact = false; } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { /* checking for early exit without dirtying pages */ From 2a41b2487612c3e3b5f4b26d9eb3d1186ee0d47d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 7 Feb 2023 19:10:23 +0300 Subject: [PATCH 338/364] =?UTF-8?q?mdbx++:=20=D1=83=D1=82=D0=BE=D1=87?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20`const`=20=D0=B8=20`noexcept`?= =?UTF-8?q?=20=D0=B4=D0=BB=D1=8F=20=D0=BD=D0=B5=D1=81=D0=BA=D0=BE=D0=BB?= =?UTF-8?q?=D1=8C=D0=BA=D0=B8=D1=85=20=D0=BC=D0=B5=D1=82=D0=BE=D0=B4=D0=BE?= =?UTF-8?q?=D0=B2.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h++ | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mdbx.h++ b/mdbx.h++ index 704688b9..47d2648f 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -2639,7 +2639,7 @@ public: return buffer(src, make_reference); } - static buffer key_from(const silo &&src) noexcept 
{ + static buffer key_from(silo &&src) noexcept { return buffer(::std::move(src)); } @@ -3591,7 +3591,7 @@ public: void close(bool dont_sync = false); env_managed(env_managed &&) = default; - env_managed &operator=(env_managed &&other) { + env_managed &operator=(env_managed &&other) noexcept { if (MDBX_UNLIKELY(handle_)) MDBX_CXX20_UNLIKELY { assert(handle_ != other.handle_); @@ -3890,7 +3890,7 @@ class LIBMDBX_API_TYPE txn_managed : public txn { public: MDBX_CXX11_CONSTEXPR txn_managed() noexcept = default; txn_managed(txn_managed &&) = default; - txn_managed &operator=(txn_managed &&other) { + txn_managed &operator=(txn_managed &&other) noexcept { if (MDBX_UNLIKELY(handle_)) MDBX_CXX20_UNLIKELY { assert(handle_ != other.handle_); @@ -4112,7 +4112,7 @@ public: void close(); cursor_managed(cursor_managed &&) = default; - cursor_managed &operator=(cursor_managed &&other) { + cursor_managed &operator=(cursor_managed &&other) noexcept { if (MDBX_UNLIKELY(handle_)) MDBX_CXX20_UNLIKELY { assert(handle_ != other.handle_); From 351a30f1861c0596ed4834fadc96c88ccd35fa98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 8 Feb 2023 00:28:24 +0300 Subject: [PATCH 339/364] =?UTF-8?q?mdbx-windows:=20=D0=BD=D0=B5=20=D1=80?= =?UTF-8?q?=D0=B0=D1=81=D1=85=D0=BE=D0=B4=D1=83=D0=B5=D0=BC=20=D1=81=D1=82?= =?UTF-8?q?=D0=B5=D0=BA=20=D0=BF=D0=BE=D0=B4=20=D0=B1=D1=83=D1=84=D0=B5?= =?UTF-8?q?=D1=80=D1=8B=20=D0=B4=D0=BB=D1=8F=20wchar-=D0=BF=D1=80=D0=B5?= =?UTF-8?q?=D0=BE=D0=B1=D1=80=D0=B0=D0=B7=D0=BE=D0=B2=D0=B0=D0=BD=D0=B8?= =?UTF-8?q?=D1=8F=20=D0=BF=D1=83=D1=82=D0=B5=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 24 ++++++++++++++++-------- src/osal.h | 12 ++++++++---- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/core.c b/src/core.c index 4eb57d6f..e32e9ba8 100644 --- a/src/core.c +++ 
b/src/core.c @@ -14486,9 +14486,11 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; + wchar_t *pathnameW = nullptr; OSAL_MB2WIDE(pathname, pathnameW); - return mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); + int rc = mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); + osal_free(pathnameW); + return rc; } __cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, @@ -14675,9 +14677,11 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; + wchar_t *pathnameW = nullptr; OSAL_MB2WIDE(pathname, pathnameW); - return mdbx_env_deleteW(pathnameW, mode); + int rc = mdbx_env_deleteW(pathnameW, mode); + osal_free(pathnameW); + return rc; } __cold int mdbx_env_deleteW(const wchar_t *pathname, @@ -14766,9 +14770,11 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; + wchar_t *pathnameW = nullptr; OSAL_MB2WIDE(pathname, pathnameW); - return mdbx_env_openW(env, pathnameW, flags, mode); + int rc = mdbx_env_openW(env, pathnameW, flags, mode); + osal_free(pathnameW); + return rc; } __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, @@ -21772,9 +21778,11 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *dest_pathW = nullptr; + wchar_t *dest_pathW = nullptr; 
OSAL_MB2WIDE(dest_path, dest_pathW); - return mdbx_env_copyW(env, dest_pathW, flags); + int rc = mdbx_env_copyW(env, dest_pathW, flags); + osal_free(dest_pathW); + return rc; } LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, diff --git a/src/osal.h b/src/osal.h index 02447376..a8e50a20 100644 --- a/src/osal.h +++ b/src/osal.h @@ -765,12 +765,16 @@ MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, const char *const from_tmp = (FROM); \ const size_t from_mblen = strlen(from_tmp); \ const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ + if (unlikely(to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX)) \ return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ + wchar_t *const to_tmp = osal_malloc((to_wlen + 1) * sizeof(wchar_t)); \ + if (unlikely(!to_tmp)) \ + return MDBX_ENOMEM; \ + if (unlikely(to_wlen + 1 != \ + osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1))) { \ + osal_free(to_tmp); \ return ERROR_INVALID_NAME; \ + } \ (TO) = to_tmp; \ } while (0) From ebbe98afa56283127c22854977a7aed2c16d9436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 9 Feb 2023 14:57:50 +0300 Subject: [PATCH 340/364] =?UTF-8?q?mdbx-windows:=20=D0=BB=D0=B8=D0=BA?= =?UTF-8?q?=D0=B2=D0=B8=D0=B4=D0=B0=D1=86=D0=B8=D1=8F=20=D0=BC=D0=B0=D0=BA?= =?UTF-8?q?=D1=80=D0=BE=D1=81=D0=B0=20`OSAL=5FMB2WIDE()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 32 ++++++++++++++++++++------------ src/osal.c | 37 +++++++++++++++++++++++++------------ src/osal.h | 21 +-------------------- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/src/core.c b/src/core.c index 
e32e9ba8..42727794 100644 --- a/src/core.c +++ b/src/core.c @@ -14487,9 +14487,11 @@ __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable) { #if defined(_WIN32) || defined(_WIN64) wchar_t *pathnameW = nullptr; - OSAL_MB2WIDE(pathname, pathnameW); - int rc = mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); - osal_free(pathnameW); + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); + osal_free(pathnameW); + } return rc; } @@ -14678,9 +14680,11 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { #if defined(_WIN32) || defined(_WIN64) wchar_t *pathnameW = nullptr; - OSAL_MB2WIDE(pathname, pathnameW); - int rc = mdbx_env_deleteW(pathnameW, mode); - osal_free(pathnameW); + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_deleteW(pathnameW, mode); + osal_free(pathnameW); + } return rc; } @@ -14771,9 +14775,11 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode) { #if defined(_WIN32) || defined(_WIN64) wchar_t *pathnameW = nullptr; - OSAL_MB2WIDE(pathname, pathnameW); - int rc = mdbx_env_openW(env, pathnameW, flags, mode); - osal_free(pathnameW); + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_openW(env, pathnameW, flags, mode); + osal_free(pathnameW); + } return rc; } @@ -21779,9 +21785,11 @@ __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { #if defined(_WIN32) || defined(_WIN64) wchar_t *dest_pathW = nullptr; - OSAL_MB2WIDE(dest_path, dest_pathW); - int rc = mdbx_env_copyW(env, dest_pathW, flags); - osal_free(dest_pathW); + int rc = osal_mb2w(dest_path, &dest_pathW); + if (likely(rc == MDBX_SUCCESS)) { + rc = 
mdbx_env_copyW(env, dest_pathW, flags); + osal_free(dest_pathW); + } return rc; } diff --git a/src/osal.c b/src/osal.c index 45cb92c8..f5630324 100644 --- a/src/osal.c +++ b/src/osal.c @@ -546,19 +546,32 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) -#ifndef WC_ERR_INVALID_CHARS -static const DWORD WC_ERR_INVALID_CHARS = - (6 /* Windows Vista */ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion()))) - ? 0x00000080 - : 0; -#endif /* WC_ERR_INVALID_CHARS */ +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst) { + const size_t dst_wlen = MultiByteToWideChar( + CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, -1, nullptr, 0); + wchar_t *dst = *pdst; + int rc = ERROR_INVALID_NAME; + if (unlikely(dst_wlen < 2 || dst_wlen > /* MAX_PATH */ INT16_MAX)) + goto bailout; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, - size_t dst_n, - const char *src, - size_t src_n) { - return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, - (int)src_n, dst, (int)dst_n); + dst = osal_realloc(dst, dst_wlen * sizeof(wchar_t)); + rc = MDBX_ENOMEM; + if (unlikely(!dst)) + goto bailout; + + *pdst = dst; + if (likely(dst_wlen == (size_t)MultiByteToWideChar(CP_THREAD_ACP, + MB_ERR_INVALID_CHARS, src, + -1, dst, (int)dst_wlen))) + return MDBX_SUCCESS; + + rc = ERROR_INVALID_NAME; +bailout: + if (*pdst) { + osal_free(*pdst); + *pdst = nullptr; + } + return rc; } #endif /* Windows */ diff --git a/src/osal.h b/src/osal.h index a8e50a20..4e228ed7 100644 --- a/src/osal.h +++ b/src/osal.h @@ -757,26 +757,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, - size_t src_n); - -#define OSAL_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = 
osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (unlikely(to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX)) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = osal_malloc((to_wlen + 1) * sizeof(wchar_t)); \ - if (unlikely(!to_tmp)) \ - return MDBX_ENOMEM; \ - if (unlikely(to_wlen + 1 != \ - osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1))) { \ - osal_free(to_tmp); \ - return ERROR_INVALID_NAME; \ - } \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, From 1684d17b0f82782748de932558045342cf4d085f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 9 Feb 2023 17:19:25 +0300 Subject: [PATCH 341/364] =?UTF-8?q?mdbx-windows:=20=D0=BF=D0=BE=D0=B4?= =?UTF-8?q?=D0=B4=D0=B5=D1=80=D0=B6=D0=BA=D0=B0=20char-=D0=B2=D0=B5=D1=80?= =?UTF-8?q?=D1=81=D0=B8=D0=B8=20`mdbx=5Fenv=5Fget=5Fpath()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 3 +-- src/core.c | 68 +++++++++++++++++++++++++++++++++++++++---------- src/internals.h | 2 ++ 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/mdbx.h b/mdbx.h index 76a42d48..190e4113 100644 --- a/mdbx.h +++ b/mdbx.h @@ -3003,9 +3003,8 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. 
*/ -#if !(defined(_WIN32) || defined(_WIN64)) LIBMDBX_API int mdbx_env_get_path(const MDBX_env *env, const char **dest); -#else +#if defined(_WIN32) || defined(_WIN64) LIBMDBX_API int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **dest); #endif /* Windows */ diff --git a/src/core.c b/src/core.c index 42727794..f9f3d35e 100644 --- a/src/core.c +++ b/src/core.c @@ -14779,6 +14779,9 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (likely(rc == MDBX_SUCCESS)) { rc = mdbx_env_openW(env, pathnameW, flags, mode); osal_free(pathnameW); + if (rc == MDBX_SUCCESS) + /* force to make cache of the multi-byte pathname representation */ + mdbx_env_get_path(env, &pathname); } return rc; } @@ -15334,6 +15337,12 @@ __cold static int env_close(MDBX_env *env) { osal_free(env->me_pathname); env->me_pathname = nullptr; } +#if defined(_WIN32) || defined(_WIN64) + if (env->me_pathname_char) { + osal_free(env->me_pathname_char); + env->me_pathname_char = nullptr; + } +#endif /* Windows */ if (env->me_txn0) { dpl_free(env->me_txn0); txl_free(env->me_txn0->tw.lifo_reclaimed); @@ -21929,19 +21938,7 @@ __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { #endif } -#if !(defined(_WIN32) || defined(_WIN64)) -__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!arg)) - return MDBX_EINVAL; - - *arg = env->me_pathname; - return MDBX_SUCCESS; -} -#else +#if defined(_WIN32) || defined(_WIN64) __cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) @@ -21955,6 +21952,51 @@ __cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { } #endif /* Windows */ +__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return 
MDBX_EINVAL; + +#if defined(_WIN32) || defined(_WIN64) + if (!env->me_pathname_char) { + *arg = nullptr; + DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80; + size_t mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname, + -1, nullptr, 0, nullptr, nullptr); + rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); + if (rc == ERROR_INVALID_FLAGS) { + mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->me_pathname, + -1, nullptr, 0, nullptr, nullptr); + rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); + } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + char *const mb_pathname = osal_malloc(mb_len); + if (!mb_pathname) + return MDBX_ENOMEM; + if (mb_len != (size_t)WideCharToMultiByte(CP_THREAD_ACP, flags, + env->me_pathname, -1, mb_pathname, + (int)mb_len, nullptr, nullptr)) { + rc = (int)GetLastError(); + osal_free(mb_pathname); + return rc; + } + if (env->me_pathname_char || + InterlockedCompareExchangePointer( + (PVOID volatile *)&env->me_pathname_char, mb_pathname, nullptr)) + osal_free(mb_pathname); + } + *arg = env->me_pathname_char; +#else + *arg = env->me_pathname; +#endif /* Windows */ + return MDBX_SUCCESS; +} + __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) diff --git a/src/internals.h b/src/internals.h index b5b8ead6..06cfbfd3 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1463,6 +1463,8 @@ struct MDBX_env { osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else osal_fastmutex_t me_remap_guard; #endif From bd35fe8970aa73c9ba010c60a549eeded8cbf5c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 9 Feb 2023 20:02:17 +0300 Subject: [PATCH 342/364] 
=?UTF-8?q?mdbx-doc:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20doxygen-=D0=BE=D0=BF?= =?UTF-8?q?=D0=B8=D1=81=D0=B0=D0=BD=D0=B8=D1=8F=20=D0=B4=D0=BB=D1=8F=20API?= =?UTF-8?q?=20=D1=81=20=D1=88=D0=B8=D1=80=D0=BE=D0=BA=D0=B8=D0=BC=D0=B8=20?= =?UTF-8?q?=D1=81=D0=B8=D0=BC=D0=B2=D0=BE=D0=BB=D0=B0=D0=BC=D0=B8.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h | 48 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/mdbx.h b/mdbx.h index 190e4113..ce46b6f6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2302,6 +2302,8 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, * be called later to discard the \ref MDBX_env handle and release associated * resources. * + * \note On Windows the \ref mdbx_env_openW() is recommended to use. + * * \param [in] env An environment handle returned * by \ref mdbx_env_create() * @@ -2369,8 +2371,11 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); -#if defined(_WIN32) || defined(_WIN64) -LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathnameW, +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_open() + * \note Available only on Windows. + * \see mdbx_env_open() */ +LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); #endif /* Windows */ @@ -2400,6 +2405,8 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; /** \brief Delete the environment's files in a proper and multiprocess-safe way. * \ingroup c_extra * + * \note On Windows the \ref mdbx_env_deleteW() is recommended to use. + * * \param [in] pathname The pathname for the database or the directory in which * the database files reside. 
* @@ -2416,8 +2423,12 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; * so no deletion was performed. */ LIBMDBX_API int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode); -#if defined(_WIN32) || defined(_WIN64) -LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_delete() + * \note Available only on Windows. + * \see mdbx_env_delete() */ +LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathname, MDBX_env_delete_mode_t mode); #endif /* Windows */ @@ -2430,6 +2441,8 @@ LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, * parallel with write transactions, because it employs a read-only * transaction. See long-lived transactions under \ref restrictions section. * + * \note On Windows the \ref mdbx_env_copyW() is recommended to use. + * * \param [in] env An environment handle returned by mdbx_env_create(). * It must have already been opened successfully. * \param [in] dest The pathname of a file in which the copy will reside. @@ -2454,7 +2467,11 @@ LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathnameW, * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest, MDBX_copy_flags_t flags); -#if defined(_WIN32) || defined(_WIN64) + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_copy() + * \note Available only on Windows. + * \see mdbx_env_copy() */ LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest, MDBX_copy_flags_t flags); #endif /* Windows */ @@ -2995,6 +3012,8 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); /** \brief Return the path that was used in mdbx_env_open(). * \ingroup c_statinfo * + * \note On Windows the \ref mdbx_env_get_pathW() is recommended to use. 
+ * * \param [in] env An environment handle returned by \ref mdbx_env_create() * \param [out] dest Address of a string pointer to contain the path. * This is the actual string in the environment, not a @@ -3004,7 +3023,11 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_path(const MDBX_env *env, const char **dest); -#if defined(_WIN32) || defined(_WIN64) + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_get_path() + * \note Available only on Windows. + * \see mdbx_env_get_path() */ LIBMDBX_API int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **dest); #endif /* Windows */ @@ -5510,13 +5533,20 @@ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, * * This function mostly of internal API for `mdbx_chk` utility and subject to * change at any time. Do not use this function to avoid shooting your own - * leg(s). */ + * leg(s). + * + * \note On Windows the \ref mdbx_env_open_for_recoveryW() is recommended + * to use. */ LIBMDBX_API int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable); -#if defined(_WIN32) || defined(_WIN64) + +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_open_for_recovery() + * \note Available only on Windows. 
+ * \see mdbx_env_open_for_recovery() */ LIBMDBX_API int mdbx_env_open_for_recoveryW(MDBX_env *env, - const wchar_t *pathnameW, + const wchar_t *pathname, unsigned target_meta, bool writeable); #endif /* Windows */ From e51140fe48e7802ea4b11ee8ae6be0b5cbbd86e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 9 Feb 2023 20:05:11 +0300 Subject: [PATCH 343/364] =?UTF-8?q?mdbx-doc:=20=D0=BA=D0=BE=D1=80=D1=80?= =?UTF-8?q?=D0=B5=D0=BA=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=BA=D0=B0=20doxyge?= =?UTF-8?q?n-=D0=BE=D0=BF=D0=B8=D1=81=D0=B0=D0=BD=D0=B8=D1=8F=20C++=20API,?= =?UTF-8?q?=20=D0=B2=20=D0=BE=D1=81=D0=BE=D0=B1=D0=B5=D0=BD=D0=BD=D0=BE?= =?UTF-8?q?=D1=81=D1=82=D0=B8=20C++20=20concepts.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mdbx.h++ | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/mdbx.h++ b/mdbx.h++ index 47d2648f..8e5e5ecf 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -223,17 +223,18 @@ #endif /* MDBX_CXX20_UNLIKELY */ #ifndef MDBX_HAVE_CXX20_CONCEPTS -#if defined(DOXYGEN) || \ - (defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L) +#if defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L #include #define MDBX_HAVE_CXX20_CONCEPTS 1 +#elif defined(DOXYGEN) +#define MDBX_HAVE_CXX20_CONCEPTS 1 #else #define MDBX_HAVE_CXX20_CONCEPTS 0 #endif /* */ #endif /* MDBX_HAVE_CXX20_CONCEPTS */ #ifndef MDBX_CXX20_CONCEPT -#if MDBX_HAVE_CXX20_CONCEPTS +#if MDBX_HAVE_CXX20_CONCEPTS || defined(DOXYGEN) #define MDBX_CXX20_CONCEPT(CONCEPT, NAME) CONCEPT NAME #else #define MDBX_CXX20_CONCEPT(CONCEPT, NAME) typename NAME @@ -241,7 +242,7 @@ #endif /* MDBX_CXX20_CONCEPT */ #ifndef MDBX_ASSERT_CXX20_CONCEPT_SATISFIED -#if MDBX_HAVE_CXX20_CONCEPTS +#if MDBX_HAVE_CXX20_CONCEPTS || defined(DOXYGEN) #define MDBX_ASSERT_CXX20_CONCEPT_SATISFIED(CONCEPT, TYPE) \ 
static_assert(CONCEPT) #else @@ -551,8 +552,11 @@ static MDBX_CXX14_CONSTEXPR size_t check_length(size_t headroom, size_t payload, /// \defgroup cxx_data slices and buffers /// @{ -#if MDBX_HAVE_CXX20_CONCEPTS +#if MDBX_HAVE_CXX20_CONCEPTS || defined(DOXYGEN) +/** \concept MutableByteProducer + * \interface MutableByteProducer + * \brief MutableByteProducer C++20 concept */ template concept MutableByteProducer = requires(T a, char array[42]) { { a.is_empty() } -> std::same_as; @@ -560,6 +564,9 @@ concept MutableByteProducer = requires(T a, char array[42]) { { a.write_bytes(&array[0], size_t(42)) } -> std::same_as; }; +/** \concept ImmutableByteProducer + * \interface ImmutableByteProducer + * \brief ImmutableByteProducer C++20 concept */ template concept ImmutableByteProducer = requires(const T &a, char array[42]) { { a.is_empty() } -> std::same_as; @@ -567,6 +574,9 @@ concept ImmutableByteProducer = requires(const T &a, char array[42]) { { a.write_bytes(&array[0], size_t(42)) } -> std::same_as; }; +/** \concept SliceTranscoder + * \interface SliceTranscoder + * \brief SliceTranscoder C++20 concept */ template concept SliceTranscoder = ImmutableByteProducer && requires(const slice &source, const T &a) { @@ -3106,10 +3116,12 @@ public: operate_parameters(const operate_parameters &) noexcept = default; MDBX_CXX14_CONSTEXPR operate_parameters & operator=(const operate_parameters &) noexcept = default; - MDBX_env_flags_t - make_flags(bool accede = true, ///< \copydoc MDBX_ACCEDE - bool use_subdirectory = - false ///< use subdirectory to place the DB files + MDBX_env_flags_t make_flags( + bool accede = true, ///< Allows accepting incompatible operating options + ///< in case the database is already being used by + ///< another process(es) \see MDBX_ACCEDE + bool use_subdirectory = + false ///< use subdirectory to place the DB files ) const; static env::mode mode_from_flags(MDBX_env_flags_t) noexcept; static env::durability durability_from_flags(MDBX_env_flags_t) 
noexcept; From 7f5ea6d3b862dfa7061391028fec51ce3abf0bb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 10 Feb 2023 13:03:23 +0300 Subject: [PATCH 344/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=80=D1=80=D0=B5?= =?UTF-8?q?=D0=BA=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=BA=D0=B0=20=D0=BF=D1=80?= =?UTF-8?q?=D0=BE=D1=82=D0=BE=D1=82=D0=B8=D0=BF=D0=B0=20`=5F=5Fasan=5Fdefa?= =?UTF-8?q?ult=5Foptions()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index f9f3d35e..97eebbdd 100644 --- a/src/core.c +++ b/src/core.c @@ -25623,7 +25623,7 @@ __dll_export }; #ifdef __SANITIZE_ADDRESS__ -LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { +LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options(void) { return "symbolize=1:allow_addr2line=1:" #if MDBX_DEBUG "debug=1:" From 25e958f081af7413e813d1692862f64b412f87e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 11 Feb 2023 00:25:14 +0300 Subject: [PATCH 345/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=D1=81=D0=B5=D1=85=20?= =?UTF-8?q?=D0=BF=D1=80=D0=B5=D0=B4=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B9=20=D1=81=D1=82=D0=B0=D1=82=D0=B8=D1=87?= =?UTF-8?q?=D0=B5=D1=81=D0=BA=D0=BE=D0=B3=D0=BE=20=D0=B0=D0=BD=D0=B0=D0=BB?= =?UTF-8?q?=D0=B8=D0=B7=D0=B0=D1=82=D0=BE=D1=80=D0=B0=20MSVC=20(=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=20=D0=BD=D0=B5=D1=81=D1=83=D1=89=D0=B5=D1=81=D1=82?= =?UTF-8?q?=D0=B2=D0=B5=D0=BD=D0=BD=D1=8B=D0=B5=20=D0=B8=D0=BB=D0=B8=20?= =?UTF-8?q?=D0=BB=D0=BE=D0=B6=D0=BD=D1=8B=D0=B5).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- src/base.h | 17 +++++ src/core.c | 168 +++++++++++++++++++++++++----------------- src/internals.h | 12 ++- src/lck-windows.c | 87 +++++++++++++--------- src/mdbx_chk.c | 10 +-- src/mdbx_load.c | 2 +- src/osal.c | 26 +++++-- test/chrono.c++ | 9 ++- test/keygen.c++ | 10 ++- test/log.c++ | 6 +- test/osal-windows.c++ | 4 +- test/test.c++ | 2 +- test/test.h++ | 6 +- 13 files changed, 223 insertions(+), 136 deletions(-) diff --git a/src/base.h b/src/base.h index 753ad005..09aa7ff3 100644 --- a/src/base.h +++ b/src/base.h @@ -669,6 +669,23 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id, note) \ + _Pragma(MDBX_STRINGIFY(prefast(suppress : warn_id))) +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id, note) +#endif + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) diff --git a/src/core.c b/src/core.c index 97eebbdd..b32ae17a 100644 --- a/src/core.c +++ b/src/core.c @@ -1529,7 +1529,7 @@ __cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, goto bailout; } if (rthc_table == rthc_table_static) - memcpy(new_table, rthc_table_static, sizeof(rthc_table_static)); + memcpy(new_table, rthc_table, sizeof(rthc_entry_t) * rthc_limit); rthc_table = new_table; rthc_limit *= 2; } @@ -2250,8 +2250,8 @@ static MDBX_PNL pnl_alloc(size_t size) { #endif /* malloc_usable_size */ pl[0] = pnl_bytes2size(bytes); assert(pl[0] >= size); - pl[1] = 0; pl += 1; + *pl = 0; } return pl; } @@ -2435,7 +2435,7 @@ pnl_merge_inner(pgno_t *__restrict 
dst, const pgno_t *__restrict src_a, // clang<=5: cmov×2, set+add/sub // clang>=6: cmov, set+add/sub *dst = flag ? *src_a : *src_b; - src_b += flag - 1; + src_b += (ptrdiff_t)flag - 1; src_a -= flag; #endif --dst; @@ -2593,7 +2593,7 @@ static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { if (likely(!pnl)) return 0; pgno <<= 1; - size_t n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); + size_t n = pnl_search(pnl, pgno, (size_t)MAX_PAGENO + MAX_PAGENO + 1); return (n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] == pgno) ? n : 0; } @@ -2620,7 +2620,7 @@ static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last; #else const size_t n = - pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); + pnl_search(pnl, spilled_range_last, (size_t)MAX_PAGENO + MAX_PAGENO + 1); assert(n && (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_last >= pnl[n])); const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin; #endif @@ -2659,8 +2659,8 @@ static MDBX_TXL txl_alloc(void) { #endif /* malloc_usable_size */ tl[0] = txl_bytes2size(bytes); assert(tl[0] >= MDBX_TXL_INITIAL); - tl[1] = 0; tl += 1; + *tl = 0; } return tl; } @@ -2884,7 +2884,7 @@ __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { #else *w = cmp ? *l : *r; l -= cmp; - r += cmp - 1; + r += (ptrdiff_t)cmp - 1; #endif } while (likely(--w > l)); assert(r == tmp - 1); @@ -4886,7 +4886,7 @@ spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { factor |= factor >> 4; factor |= factor >> 8; factor |= factor >> 16; - factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; + factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; factor = (factor < 256) ? 
255 - factor : 0; tASSERT(txn, factor < 256 && factor < (256 - prio)); return prio = (unsigned)factor; @@ -5007,6 +5007,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; #if MDBX_AVOID_MSYNC + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); tASSERT(txn, dirtylist_check(txn)); env->me_lck->mti_unsynced_pages.weak += txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; @@ -5032,6 +5033,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", need_spill_entries, need_spill_npages); + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= need_spill_npages); @@ -5549,6 +5551,7 @@ meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) { static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, const meta_troika_t *troika) { const uint8_t tail = troika->tail_and_flags & 3; + MDBX_ANALYSIS_ASSUME(tail < NUM_METAS); meta_ptr_t r; r.txnid = troika->txnid[tail]; r.ptr_v = METAPAGE(env, tail); @@ -5998,7 +6001,7 @@ static void adjust_defaults(MDBX_env *env) { const size_t basis = env->me_dbgeo.now; /* TODO: use options? */ const unsigned factor = 9; - size_t threshold = (basis < (65536ul << factor)) + size_t threshold = (basis < ((size_t)65536 << factor)) ? 65536 /* minimal threshold */ : (basis > (MEGABYTE * 4 << factor)) ? 
MEGABYTE * 4 /* maximal threshold */ @@ -8300,22 +8303,24 @@ static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { static void cursors_eot(MDBX_txn *txn, const bool merge) { tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); for (intptr_t i = txn->mt_numdbs; --i > FREE_DBI;) { - MDBX_cursor *next, *mc = txn->mt_cursors[i]; + MDBX_cursor *mc = txn->mt_cursors[i]; if (!mc) continue; - txn->mt_cursors[i] = NULL; + txn->mt_cursors[i] = nullptr; do { const unsigned stage = mc->mc_signature; - MDBX_cursor *bk = mc->mc_backup; - next = mc->mc_next; + MDBX_cursor *const next = mc->mc_next; + MDBX_cursor *const bk = mc->mc_backup; ENSURE(txn->mt_env, stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); if (bk) { MDBX_xcursor *mx = mc->mc_xcursor; - cASSERT(mc, mx == bk->mc_xcursor); tASSERT(txn, txn->mt_parent != NULL); + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 6001, "Using uninitialized memory '*mc->mc_backup'."); ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + tASSERT(txn, mx == bk->mc_xcursor); if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) mc->mc_signature = stage /* Promote closed state to parent txn */; else if (merge) { @@ -8345,7 +8350,8 @@ static void cursors_eot(MDBX_txn *txn, const bool merge) { mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; mc->mc_flags = 0 /* reset C_UNTRACK */; } - } while ((mc = next) != NULL); + mc = next; + } while (mc); } } @@ -9187,9 +9193,6 @@ void *mdbx_txn_get_userctx(const MDBX_txn *txn) { int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret, void *context) { - MDBX_txn *txn; - size_t size, tsize; - if (unlikely(!ret)) return MDBX_EINVAL; *ret = NULL; @@ -9208,6 +9211,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, flags |= env->me_flags & MDBX_WRITEMAP; + MDBX_txn *txn = nullptr; if (parent) { /* Nested transactions: Max 1 child, write txns only, no writemap 
*/ rc = check_txn_rw(parent, @@ -9238,9 +9242,13 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, goto renew; } - size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); - size += tsize = sizeof(MDBX_txn); - if (unlikely((txn = osal_malloc(size)) == NULL)) { + const size_t base = (flags & MDBX_TXN_RDONLY) + ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) + : sizeof(MDBX_txn); + const size_t size = + base + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); + txn = osal_malloc(size); + if (unlikely(txn == nullptr)) { DEBUG("calloc: %s", "failed"); return MDBX_ENOMEM; } @@ -9248,11 +9256,13 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memset(txn, 0xCD, size); VALGRIND_MAKE_MEM_UNDEFINED(txn, size); #endif /* MDBX_DEBUG */ - memset(txn, 0, tsize); - txn->mt_dbxs = env->me_dbxs; /* static */ - txn->mt_dbs = ptr_disp(txn, tsize); + MDBX_ANALYSIS_ASSUME(size > base); + memset(txn, 0, + (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? 
size : base); + txn->mt_dbs = ptr_disp(txn, base); txn->mt_cursors = ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); txn->mt_dbistate = ptr_disp(txn, size - env->me_maxdbs); + txn->mt_dbxs = env->me_dbxs; /* static */ txn->mt_flags = flags; txn->mt_env = env; @@ -9987,8 +9997,9 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, txn->mt_dbistate[i] |= DBI_AUDITED; if (txn->mt_dbs[i].md_root == P_INVALID) continue; - used += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; + used += (size_t)txn->mt_dbs[i].md_branch_pages + + (size_t)txn->mt_dbs[i].md_leaf_pages + + (size_t)txn->mt_dbs[i].md_overflow_pages; if (i != MAIN_DBI) continue; @@ -10016,8 +10027,8 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, } } } - used += - db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; + used += (size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + + (size_t)db->md_overflow_pages; } } rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); @@ -10031,11 +10042,13 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, continue; for (MDBX_txn *t = txn; t; t = t->mt_parent) if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { - used += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages + - t->mt_dbs[i].md_overflow_pages; + used += (size_t)t->mt_dbs[i].md_branch_pages + + (size_t)t->mt_dbs[i].md_leaf_pages + + (size_t)t->mt_dbs[i].md_overflow_pages; txn->mt_dbistate[i] |= DBI_AUDITED; break; } + MDBX_ANALYSIS_ASSUME(txn != nullptr); if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { WARNING("audit %s@%" PRIaTXN ": unable account dbi %zd / \"%*s\", state 0x%02x", @@ -10787,8 +10800,9 @@ retry: : INT16_MAX; if (avail_gc_slots > 1) { #if MDBX_ENABLE_BIGFOOT - chunk = (chunk < env->me_maxgc_ov1page * 2) ? chunk / 2 - : env->me_maxgc_ov1page; + chunk = (chunk < env->me_maxgc_ov1page * (size_t)2) + ? 
chunk / 2 + : env->me_maxgc_ov1page; #else if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; @@ -12641,8 +12655,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv) : shrink_step; const pgno_t with_backlog_gap = largest_pgno + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t aligned = + pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - + with_backlog_gap % aligner); const pgno_t bottom = (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; @@ -13566,10 +13581,10 @@ __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ __cold static int alloc_page_buf(MDBX_env *env) { - return env->me_pbuf - ? MDBX_SUCCESS - : osal_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS, - &env->me_pbuf); + return env->me_pbuf ? MDBX_SUCCESS + : osal_memalign_alloc(env->me_os_psize, + env->me_psize * (size_t)NUM_METAS, + &env->me_pbuf); } /* Further setup required for opening an MDBX environment */ @@ -13599,8 +13614,8 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return err; header = *init_metas(env, env->me_pbuf); - err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, - 0); + err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, + env->me_psize * (size_t)NUM_METAS, 0); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -14640,7 +14655,8 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, pathchar_t *const buf = ctx->buffer_for_free; rc = MDBX_SUCCESS; if (ctx->ent_len) { - memcpy(buf, pathname, sizeof(pathchar_t) * pathname_len); + memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, pathname, + sizeof(pathchar_t) * pathname_len); if (*flags & MDBX_NOSUBDIR) { const pathchar_t *const lck_ext = osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); @@ -14657,7 
+14673,8 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, rc = check_alternative_lck_absent(buf); } - memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); + memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, pathname, + sizeof(pathchar_t) * (ctx->ent_len + 1)); memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); if (*flags & MDBX_NOSUBDIR) { memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); @@ -14667,11 +14684,13 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, } } else { assert(!(*flags & MDBX_NOSUBDIR)); - memcpy(buf, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, dxb_name + 1, + sizeof(dxb_name) - sizeof(pathchar_t)); memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); rc = check_alternative_lck_absent(buf); - memcpy(ctx->dxb, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, + dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); } return rc; @@ -15187,8 +15206,9 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, sizeof(MDBX_atomic_uint32_t) + 1); rc = alloc_page_buf(env); if (rc == MDBX_SUCCESS) { - memset(env->me_pbuf, -1, env->me_psize * 2); - memset(ptr_disp(env->me_pbuf, env->me_psize * 2), 0, env->me_psize); + memset(env->me_pbuf, -1, env->me_psize * (size_t)2); + memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, + env->me_psize); MDBX_txn *txn = osal_calloc(1, size); if (txn) { txn->mt_dbs = ptr_disp(txn, tsize); @@ -15735,7 +15755,7 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, } else { const struct node_result nsr = node_search(mc, key); if (likely(nsr.node)) - i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; + i = mc->mc_ki[mc->mc_top] + (intptr_t)nsr.exact - 1; else i = 
page_numkeys(mp) - 1; DEBUG("following index %zu for key [%s]", i, DKEY_DEBUG(key)); @@ -16137,9 +16157,9 @@ static int cursor_sibling(MDBX_cursor *mc, int dir) { DEBUG("parent page is page %" PRIaPGNO ", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); - if ((dir == SIBLING_RIGHT) - ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { + if ((dir == SIBLING_RIGHT) ? (mc->mc_ki[mc->mc_top] + (size_t)1 >= + page_numkeys(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { DEBUG("no more keys aside, moving to next %s sibling", dir ? "right" : "left"); if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { @@ -16188,7 +16208,7 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { - if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) + if (mc->mc_ki[mc->mc_top] + (size_t)1 >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } @@ -16503,7 +16523,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { return ret; } } else { - mc->mc_pg[0] = 0; + mc->mc_pg[0] = nullptr; } ret.err = page_search(mc, &aligned_key, 0); @@ -16511,6 +16531,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { return ret; mp = mc->mc_pg[mc->mc_top]; + MDBX_ANALYSIS_ASSUME(mp != nullptr); cASSERT(mc, IS_LEAF(mp)); search_node:; @@ -16568,10 +16589,12 @@ got_node: if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } else { + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; @@ -16679,6 +16702,7 @@ static 
int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; @@ -16728,6 +16752,7 @@ static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; @@ -17141,10 +17166,10 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, size_t need = CURSOR_STACK + 3; /* 2) GC/FreeDB for any payload */ if (mc->mc_dbi > FREE_DBI) { - need += txn->mt_dbs[FREE_DBI].md_depth + 3; + need += txn->mt_dbs[FREE_DBI].md_depth + (size_t)3; /* 3) Named DBs also dirty the main DB */ if (mc->mc_dbi > MAIN_DBI) - need += txn->mt_dbs[MAIN_DBI].md_depth + 3; + need += txn->mt_dbs[MAIN_DBI].md_depth + (size_t)3; } #if xMDBX_DEBUG_SPILLING != 2 /* production mode */ @@ -17152,7 +17177,7 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, * for extensively splitting, rebalance and merging */ need += need; /* 5) Factor the key+data which to be put in */ - need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1; + need += bytes2pgno(txn->mt_env, node_size(key, data)) + (size_t)1; #else /* debug mode */ (void)key; @@ -18242,6 +18267,7 @@ __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, size_t indx, const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_ANALYSIS_ASSUME(key != nullptr); DKBUF_DEBUG; DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, " " key size %" PRIuPTR " [%s]", @@ -18322,6 +18348,8 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, const MDBX_val *key, MDBX_val *data, unsigned flags) { + MDBX_ANALYSIS_ASSUME(key != 
nullptr); + MDBX_ANALYSIS_ASSUME(data != nullptr); MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR @@ -19659,7 +19687,7 @@ static int rebalance(MDBX_cursor *mc) { const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); STATIC_ASSERT(P_BRANCH == 1); - const size_t minkeys = (pagetype & P_BRANCH) + 1; + const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1; /* Pages emptier than this are candidates for merging. */ size_t room_threshold = likely(mc->mc_dbi != FREE_DBI) @@ -19797,9 +19825,10 @@ static int rebalance(MDBX_cursor *mc) { return rc; cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } - if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { + if (mn.mc_ki[pre_top] + (size_t)1 < page_numkeys(mn.mc_pg[pre_top])) { rc = page_get( - &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), + &mn, + node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + (size_t)1)), &right, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20454,7 +20483,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, return rc; } STATIC_ASSERT(P_BRANCH == 1); - const size_t minkeys = (mp->mp_flags & P_BRANCH) + 1; + const size_t minkeys = (mp->mp_flags & P_BRANCH) + (size_t)1; DEBUG(">> splitting %s-page %" PRIaPGNO " and adding %zu+%zu [%s] at %i, nkeys %zi", @@ -20758,7 +20787,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } /* root split? */ - ptop += mc->mc_snum - snum; + ptop += mc->mc_snum - (size_t)snum; /* Right page might now have changed parent. * Check if left page also changed parent. 
*/ @@ -20811,7 +20840,7 @@ static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (unlikely(rc != MDBX_SUCCESS)) goto done; - MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1); + MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + (size_t)1); cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page); } else { mn.mc_top--; @@ -21416,7 +21445,7 @@ __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { /* Calculate filesize taking in account shrink/growing thresholds */ if (meta->mm_geo.next != meta->mm_geo.now) { meta->mm_geo.now = meta->mm_geo.next; - const pgno_t aligner = pv2pages( + const size_t aligner = pv2pages( meta->mm_geo.grow_pv ? meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv); if (aligner) { const pgno_t aligned = pgno_align2os_pgno( @@ -22874,9 +22903,10 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) cursor_pop(mc); - rc = pnl_need(&txn->tw.retired_pages, mc->mc_db->md_branch_pages + - mc->mc_db->md_leaf_pages + - mc->mc_db->md_overflow_pages); + rc = pnl_need(&txn->tw.retired_pages, + (size_t)mc->mc_db->md_branch_pages + + (size_t)mc->mc_db->md_leaf_pages + + (size_t)mc->mc_db->md_overflow_pages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -24647,8 +24677,8 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = - IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); @@ -24673,8 +24703,8 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { 
assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = - IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); diff --git a/src/internals.h b/src/internals.h index 06cfbfd3..7bd0f96d 100644 --- a/src/internals.h +++ b/src/internals.h @@ -86,14 +86,18 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ +#endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ @@ -1693,14 +1697,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } 
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool diff --git a/src/lck-windows.c b/src/lck-windows.c index 500510d9..10c24503 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -175,7 +175,7 @@ static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { #else #define DXB_MAXLEN UINT32_C(0x7ff00000) #endif -#define DXB_BODY (env->me_psize * NUM_METAS), DXB_MAXLEN +#define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN #define DXB_WHOLE 0, DXB_MAXLEN int mdbx_txn_lock(MDBX_env *env, bool dontwait) { @@ -194,8 +194,12 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } - if (env->me_flags & MDBX_EXCLUSIVE) + if (env->me_flags & MDBX_EXCLUSIVE) { + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 26115, "Failing to release lock 'env->me_windowsbug_lock' in function " + "'mdbx_txn_lock'"); return MDBX_SUCCESS; + } const HANDLE fd4data = env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; @@ -213,8 +217,12 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); } } - if (rc == MDBX_SUCCESS) + if (rc == MDBX_SUCCESS) { + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 26115, "Failing to release lock 'env->me_windowsbug_lock' in function " + "'mdbx_txn_lock'"); return rc; + } LeaveCriticalSection(&env->me_windowsbug_lock); return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; @@ -281,17 +289,18 @@ static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) { const unsigned limit = (*array)->limit; if ((*array)->count == limit) { - void *ptr = osal_realloc( - (limit > ARRAY_LENGTH((*array)->handles)) - ? *array - : /* don't free initial array on the stack */ NULL, - sizeof(mdbx_handle_array_t) + - sizeof(HANDLE) * (limit * 2 - ARRAY_LENGTH((*array)->handles))); + mdbx_handle_array_t *const ptr = + osal_realloc((limit > ARRAY_LENGTH((*array)->handles)) + ? 
*array + : /* don't free initial array on the stack */ NULL, + sizeof(mdbx_handle_array_t) + + sizeof(HANDLE) * (limit * (size_t)2 - + ARRAY_LENGTH((*array)->handles))); if (!ptr) return MDBX_ENOMEM; if (limit == ARRAY_LENGTH((*array)->handles)) - memcpy(ptr, *array, sizeof(mdbx_handle_array_t)); - *array = (mdbx_handle_array_t *)ptr; + *ptr = **array; + *array = ptr; (*array)->limit = limit * 2; } @@ -839,38 +848,40 @@ MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; #endif /* GCC/MINGW */ static void mdbx_winnt_import(void) { - const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); - #define GET_PROC_ADDR(dll, ENTRY) \ mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY) - if (GetProcAddress(hNtdll, "wine_get_version")) { - assert(mdbx_RunningUnderWine()); - } else { - GET_PROC_ADDR(hNtdll, NtFsControlFile); - GET_PROC_ADDR(hNtdll, NtExtendSection); - assert(!mdbx_RunningUnderWine()); + const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); + if (hNtdll) { + if (GetProcAddress(hNtdll, "wine_get_version")) { + assert(mdbx_RunningUnderWine()); + } else { + GET_PROC_ADDR(hNtdll, NtFsControlFile); + GET_PROC_ADDR(hNtdll, NtExtendSection); + assert(!mdbx_RunningUnderWine()); + } } const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll"); - GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx); - GET_PROC_ADDR(hKernel32dll, GetTickCount64); - if (!mdbx_GetTickCount64) - mdbx_GetTickCount64 = stub_GetTickCount64; - if (!mdbx_RunningUnderWine()) { - GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle); - GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); - GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); - GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); - GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); + if (hKernel32dll) { + GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx); + GET_PROC_ADDR(hKernel32dll, GetTickCount64); + if (!mdbx_GetTickCount64) + mdbx_GetTickCount64 = stub_GetTickCount64; + if 
(!mdbx_RunningUnderWine()) { + GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle); + GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); + GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); + GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); + GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); + } } - const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); - GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); -#undef GET_PROC_ADDR - - const osal_srwlock_t_function init = (osal_srwlock_t_function)GetProcAddress( - hKernel32dll, "InitializeSRWLock"); + const osal_srwlock_t_function init = + (osal_srwlock_t_function)(hKernel32dll + ? GetProcAddress(hKernel32dll, + "InitializeSRWLock") + : nullptr); if (init != NULL) { osal_srwlock_Init = init; osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress( @@ -888,6 +899,12 @@ static void mdbx_winnt_import(void) { osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; } + + const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); + if (hAdvapi32dll) { + GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); + } +#undef GET_PROC_ADDR } #if __GNUC_PREREQ(8, 0) diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 7a13e733..a8c97372 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -571,8 +571,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else { - dbi->payload_bytes += payload_bytes + header_bytes; - walk.total_payload_bytes += payload_bytes + header_bytes; + dbi->payload_bytes += (uint64_t)payload_bytes + header_bytes; + walk.total_payload_bytes += (uint64_t)payload_bytes + header_bytes; } } } @@ -632,7 +632,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, pgno_t prev = MDBX_PNL_ASCENDING ? 
NUM_METAS - 1 : txn->mt_next_pgno; pgno_t span = 1; - for (unsigned i = 0; i < number; ++i) { + for (size_t i = 0; i < number; ++i) { if (check_user_break()) return MDBX_EINTR; const pgno_t pgno = iptr[i]; @@ -651,7 +651,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, if (MDBX_PNL_DISORDERED(prev, pgno)) { bad = " [bad sequence]"; problem_add("entry", txnid, "bad sequence", - "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, + "%" PRIaPGNO " %c [%zu].%" PRIaPGNO, prev, (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), i, pgno); } @@ -677,7 +677,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, " pages, maxspan %" PRIaPGNO "%s\n", txnid, number, span, bad); if (verbose > 4) { - for (unsigned i = 0; i < number; i += span) { + for (size_t i = 0; i < number; i += span) { const pgno_t pgno = iptr[i]; for (span = 1; i + span < number && diff --git a/src/mdbx_load.c b/src/mdbx_load.c index 8a7a191a..552fedc8 100644 --- a/src/mdbx_load.c +++ b/src/mdbx_load.c @@ -673,7 +673,7 @@ int main(int argc, char *argv[]) { goto env_close; } - kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + 1; + kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + (size_t)1; if (kbuf.iov_len >= INTPTR_MAX / 2) { if (!quiet) fprintf(stderr, "mdbx_env_get_maxkeysize() failed, returns %zu\n", diff --git a/src/osal.c b/src/osal.c index f5630324..748767ba 100644 --- a/src/osal.c +++ b/src/osal.c @@ -48,6 +48,7 @@ static int ntstatus2errcode(NTSTATUS status) { OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); ov.Internal = status; + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387, "'_Param_(1)' could be '0'"); return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? 
MDBX_SUCCESS : (int)GetLastError(); } @@ -82,6 +83,8 @@ extern NTSTATUS NTAPI NtMapViewOfSection( extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, IN OPTIONAL PVOID BaseAddress); +MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251, + "Inconsistent annotation for 'NtClose'...") extern NTSTATUS NTAPI NtClose(HANDLE Handle); extern NTSTATUS NTAPI NtAllocateVirtualMemory( @@ -320,7 +323,7 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, return needed; } - *strp = osal_malloc(needed + 1); + *strp = osal_malloc(needed + (size_t)1); if (unlikely(*strp == nullptr)) { va_end(ones); #if defined(_WIN32) || defined(_WIN64) @@ -331,7 +334,7 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, return -1; } - int actual = vsnprintf(*strp, needed + 1, fmt, ones); + int actual = vsnprintf(*strp, needed + (size_t)1, fmt, ones); va_end(ones); assert(actual == needed); @@ -692,7 +695,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, ((bytes | (uintptr_t)data | ior->last_bytes | (uintptr_t)(uint64_t)item->sgv[0].Buffer) & ior_alignment_mask) == 0 && - ior->last_sgvcnt + segments < OSAL_IOV_MAX) { + ior->last_sgvcnt + (size_t)segments < OSAL_IOV_MAX) { assert(ior->overlapped_fd); assert((item->single.iov_len & ior_WriteFile_flag) == 0); assert(item->sgv[ior->last_sgvcnt].Buffer == 0); @@ -801,6 +804,8 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk( if (bytes & ior_WriteFile_flag) { data = Ptr64ToPtr(item->sgv[0].Buffer); bytes = ior->pagesize; + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 6385, "Reading invalid data from 'item->sgv'"); while (item->sgv[i].Buffer) { if (data + ior->pagesize != item->sgv[i].Buffer) { callback(ctx, offset, data, bytes); @@ -847,6 +852,8 @@ osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { if (bytes & ior_WriteFile_flag) { assert(ior->overlapped_fd && fd == ior->overlapped_fd); bytes = ior->pagesize; + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 6385, "Reading invalid data from 
'item->sgv'"); while (item->sgv[i].Buffer) { bytes += ior->pagesize; ++i; @@ -985,6 +992,8 @@ osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; if (bytes & ior_WriteFile_flag) { bytes = ior->pagesize; + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 6385, "Reading invalid data from 'item->sgv'"); while (item->sgv[i].Buffer) { bytes += ior->pagesize; ++i; @@ -1078,9 +1087,12 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { if (item->ov.hEvent && item->ov.hEvent != ior) ior_put_event(ior, item->ov.hEvent); size_t i = 1; - if ((item->single.iov_len & ior_WriteFile_flag) == 0) + if ((item->single.iov_len & ior_WriteFile_flag) == 0) { + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 6385, "Reading invalid data from 'item->sgv'"); while (item->sgv[i].Buffer) ++i; + } item = ior_next(item, i); } } @@ -1095,8 +1107,11 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { static void ior_cleanup(osal_ioring_t *ior, const size_t since) { osal_ioring_reset(ior); #if defined(_WIN32) || defined(_WIN64) - for (size_t i = since; i < ior->event_stack; ++i) + for (size_t i = since; i < ior->event_stack; ++i) { + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( + 6001, "Using uninitialized memory '**ior.event_pool'"); CloseHandle(ior->event_pool[i]); + } ior->event_stack = 0; #else (void)since; @@ -2734,6 +2749,7 @@ retry_mapview:; #endif /* POSIX / Windows */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287, "Redundant code"); assert(rc != MDBX_SUCCESS || (map->base != nullptr && map->base != MAP_FAILED && map->current == size && map->limit == limit && diff --git a/test/chrono.c++ b/test/chrono.c++ index 71273e92..4d53b60d 100644 --- a/test/chrono.c++ +++ b/test/chrono.c++ @@ -87,10 +87,11 @@ time from_ms(uint64_t ms) { time now_realtime() { #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) static void(WINAPI * query_time)(LPFILETIME); - if (!query_time) { - query_time = (void(WINAPI 
*)(LPFILETIME))GetProcAddress( - GetModuleHandle(TEXT("kernel32.dll")), - "GetSystemTimePreciseAsFileTime"); + if (unlikely(!query_time)) { + HMODULE hModule = GetModuleHandle(TEXT("kernel32.dll")); + if (hModule) + query_time = (void(WINAPI *)(LPFILETIME))GetProcAddress( + hModule, "GetSystemTimePreciseAsFileTime"); if (!query_time) query_time = GetSystemTimeAsFileTime; } diff --git a/test/keygen.c++ b/test/keygen.c++ index e8e53262..a6d20f33 100644 --- a/test/keygen.c++ +++ b/test/keygen.c++ @@ -227,7 +227,8 @@ void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, (void)thread_number; mapping = actor.keygen; - salt = (actor.keygen.seed + actor_id) * UINT64_C(14653293970879851569); + salt = + (actor.keygen.seed + uint64_t(actor_id)) * UINT64_C(14653293970879851569); base = actor.serial_base(); } @@ -315,11 +316,12 @@ void __hot maker::mk_begin(const serial_t serial, const essentials &params, out.value.iov_len = std::max(unsigned(params.minlen), length(serial)); const auto variation = params.maxlen - params.minlen; if (variation) { - if (serial % (variation + 1)) { + if (serial % (variation + serial_t(1))) { auto refix = serial * UINT64_C(48835288005252737); refix ^= refix >> 32; - out.value.iov_len = std::max( - out.value.iov_len, params.minlen + 1 + size_t(refix) % variation); + out.value.iov_len = + std::max(out.value.iov_len, + params.minlen + size_t(1) + size_t(refix) % variation); } } diff --git a/test/log.c++ b/test/log.c++ index bc52432e..04dad84d 100644 --- a/test/log.c++ +++ b/test/log.c++ @@ -142,7 +142,7 @@ void output_nocheckloglevel_ap(const logging::loglevel priority, prefix.c_str(), level2str(priority), suffix.c_str()); va_list ones; - memset(&ones, 0, sizeof(ones)) /* zap MSVC and other stupid compilers */; + memset(&ones, 0, sizeof(ones)) /* zap MSVC and other goofy compilers */; if (same_or_higher(priority, error)) va_copy(ones, ap); vfprintf(last, format, ap); @@ -153,11 +153,11 @@ void output_nocheckloglevel_ap(const
logging::loglevel priority, switch (end) { default: putc('\n', last); - // fall through + MDBX_CXX17_FALLTHROUGH; // fall through case '\n': fflush(last); last = nullptr; - // fall through + MDBX_CXX17_FALLTHROUGH; // fall through case ' ': case '_': case ':': diff --git a/test/osal-windows.c++ b/test/osal-windows.c++ index 57746532..24cde253 100644 --- a/test/osal-windows.c++ +++ b/test/osal-windows.c++ @@ -248,7 +248,7 @@ Environment: CommandLine.push_back('"'); for (auto It = Argument.begin();; ++It) { - unsigned NumberBackslashes = 0; + size_t NumberBackslashes = 0; while (It != Argument.end() && *It == '\\') { ++It; @@ -435,7 +435,7 @@ void osal_udelay(size_t us) { unsigned timeslice_ms = 1; while (timeBeginPeriod(timeslice_ms) == TIMERR_NOCANDO) ++timeslice_ms; - threshold_us = timeslice_ms * 1500u; + threshold_us = timeslice_ms * size_t(1500); assert(threshold_us > 0); } diff --git a/test/test.c++ b/test/test.c++ index 1e8429c5..77c90c0a 100644 --- a/test/test.c++ +++ b/test/test.c++ @@ -100,7 +100,7 @@ int testcase::hsr_callback(const MDBX_env *env, const MDBX_txn *txn, info.mi_geo.current >= info.mi_geo.upper)) { osal_yield(); if (retry > 0) - osal_udelay(retry * 100); + osal_udelay(retry * size_t(100)); return MDBX_RESULT_FALSE /* retry / wait until reader done */; } diff --git a/test/test.h++ b/test/test.h++ index 52a2add3..6158ba66 100644 --- a/test/test.h++ +++ b/test/test.h++ @@ -101,10 +101,10 @@ class testcase; class registry { struct record { - actor_testcase id; + actor_testcase id = ac_none; std::string name; - bool (*review_params)(actor_params &); - testcase *(*constructor)(const actor_config &, const mdbx_pid_t); + bool (*review_params)(actor_params &) = nullptr; + testcase *(*constructor)(const actor_config &, const mdbx_pid_t) = nullptr; }; std::unordered_map name2id; std::unordered_map id2record; From c9d11cbac18b511e6c77a9b7087c6de5ca977151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 11 Feb 2023 07:35:56 +0300 Subject: [PATCH 346/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index e16aea42..adcdf42a 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -17,24 +17,22 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Исправления (без корректировок новых функций): - - Устранение регресса после коммита 474391c83c5f81def6fdf3b0b6f5716a87b78fbf - приводящего к возврату ERROR_SHARING_VIOLATION в Windows при открытии БД + - Устранен регресс после коммита 474391c83c5f81def6fdf3b0b6f5716a87b78fbf, + приводящий к возврату ERROR_SHARING_VIOLATION в Windows при открытии БД в режиме MDBX_EXCLUSIVE для чтения-записи. - - Ограничиваем размер отображения при коротком read-only файле для - предотвращении ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая совсем - не информативна для пользователя и возникает в этом случае. + - Добавлено ограничение размера отображения при коротком read-only файле, для + предотвращения ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая возникает + в этом случае и совсем не информативна для пользователя. - - Рефакторинг `dxb_resize()`. В том числе, для устранения срабатывания + - Произведен рефакторинг `dxb_resize()`, в том числе, для устранения срабатывания assert-проверки `size_bytes == env->me_dxb_mmap.current` в специфических многопоточных сценариях использования. Проверка срабатывала только в отладочных сборках, при специфическом наложении во времени читающей и пишущей транзакции в разных потоках, одновременно с изменением размера БД. Кроме срабатывание проверки, каких-либо других последствий не возникало. 
- - Исправление copy&paste опечатки в разделе "Getting started" документации. - - - Устранение проблемы `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены + - Устранена проблема в `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены всех значений единственного ключа в subDb. В ходе этой операции subDb становится полностью пустой, без каких-либо страниц и именно эта ситуация не была учтена в коде, что приводило к повреждению БД @@ -42,9 +40,17 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic Ликвидация технических долгов и мелочи: - - Исправление опечаток. - - Доработка теста для полной стохастической проверки `MDBX_EKEYMISMATCH` в режиме `MDBX_APPEND`. - - Добавление в CMake-тесты вызова mdbx_chk в режиме чтения-записи для проверки MDBX_EXCLUSIVE в этом режиме. + - Исправлены многочисленные опечатки в документации. + - Доработан тест для полной стохастической проверки `MDBX_EKEYMISMATCH` в режиме `MDBX_APPEND`. + - Расширены сценарии запуска `mdbx_chk` из CMake-тестов для проверки как в обычном, + так и эксклюзивном режимах чтения-записи. + - Уточнены спецификаторы `const` и `noexcept` для нескольких методов в C++ API. + - Устранено использование стека под буферы для `wchar`-преобразования путей. + - Для Windows добавлена функция `mdbx_env_get_path()` для получения пути к БД + в формате многобайтных символов. + - Добавлены doxygen-описания для API с широкими символами. + - Устранены предупреждения статического анализатора MSVC, + все они были несущественные, либо ложные. 
------------------------------------------------------------------------------- From 8fba5ac8d8a475881ab8ed59937c7a34c0477ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sun, 12 Feb 2023 00:32:49 +0300 Subject: [PATCH 347/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B8=D0=B7=D0=BB=D0=B8=D1=88?= =?UTF-8?q?=D0=BD=D0=B5=D0=B9=20assert-=D0=BF=D1=80=D0=BE=D0=B2=D0=B5?= =?UTF-8?q?=D1=80=D0=BA=D0=B8=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B8=20`ov?= =?UTF-8?q?erride=5Fmeta()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 5 +++++ src/core.c | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index adcdf42a..771e93e5 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -38,6 +38,11 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic ситуация не была учтена в коде, что приводило к повреждению БД при фиксации такой транзакции. + - Устранена излишняя assert-проверка внутри `override_meta()`. + Что в отладочных сборках могло приводить к ложным срабатываниям + при восстановлении БД, в том числе при автоматическом откате слабых + мета-страниц. + Ликвидация технических долгов и мелочи: - Исправлены многочисленные опечатки в документации. 
diff --git a/src/core.c b/src/core.c index b32ae17a..17cbf105 100644 --- a/src/core.c +++ b/src/core.c @@ -14382,7 +14382,8 @@ __cold static int __must_check_result override_meta(MDBX_env *env, meta_model(env, page, target); MDBX_meta *const model = page_meta(page); meta_set_txnid(env, model, txnid); - eASSERT(env, coherency_check_meta(env, model, true)); + if (txnid) + eASSERT(env, coherency_check_meta(env, model, true)); if (shape) { if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { ERROR("bailout overriding meta-%zu since model failed " From b8092dd0dbf1f3c84cebfc5a142b9a9ce6d2da08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 13 Feb 2023 16:00:03 +0300 Subject: [PATCH 348/364] =?UTF-8?q?mdbx:=20=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BB=D0=BE=D0=B6=D0=BD=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D0=BF=D1=80=D0=B5=D0=B4=D1=83=D0=BF=D1=80=D0=B5?= =?UTF-8?q?=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D1=8F=20GCC=20=D0=BF=D1=80=D0=B8?= =?UTF-8?q?=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B5=20=D0=B4=D0=BB=D1=8F=20?= =?UTF-8?q?SH4.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 17cbf105..1f2b7adb 100644 --- a/src/core.c +++ b/src/core.c @@ -1674,7 +1674,7 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { ? uniq_peek(pending, &scan->me_lck_mmap) : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { - uint64_t length; + uint64_t length = 0; if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && length == 0)) { /* LY: skip checking since LCK-file is empty, i.e. just created. 
*/ From 57ca0d6e1bda8ca6487797118bbdc0be27a1e43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 13 Feb 2023 16:24:52 +0300 Subject: [PATCH 349/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=80=D1=80=D0=B5?= =?UTF-8?q?=D0=BA=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=BA=D0=B0=20=D0=BC=D0=B0?= =?UTF-8?q?=D0=BA=D1=80=D0=BE=D1=81=D0=BE=D0=B2=20`=5F=5Fcold`/`=5F=5Fhot`?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit В том числе для устранения проблемы `error: inlining failed in call to ‘always_inline FOO(...)’: target specific option mismatch` при сборке посредством GCC >10.x для SH4. --- src/base.h | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/base.h b/src/base.h index 09aa7ff3..7831b2cd 100644 --- a/src/base.h +++ b/src/base.h @@ -572,17 +572,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __hot #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __hot __attribute__((__hot__)) __optimize(3) -#elif defined(__clang__) && !__has_attribute(__hot_) && \ +#if defined(__clang__) && !__has_attribute(__hot__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") -#elif defined(__LCC__) -#define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) #elif defined(__GNUC__) || __has_attribute(__hot__) -#define __hot __attribute__((__hot__)) __optimize("O3") +#define __hot __attribute__((__hot__)) #else #define __hot __optimize("O3") #endif @@ -593,17 +589,13 @@ __extern_C key_t ftok(const char *, int); #ifndef __cold #if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __attribute__((__cold__)) __optimize(1) -#elif defined(__clang__) && !__has_attribute(cold) && \ +#if defined(__clang__) && 
!__has_attribute(__cold__) && \ __has_attribute(__section__) && \ (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") -#elif defined(__LCC__) -#define __hot __attribute__((__cold__, __optimize__("Osize"))) -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((__cold__)) __optimize("Os") +#elif defined(__GNUC__) || __has_attribute(__cold__) +#define __cold __attribute__((__cold__)) #else #define __cold __optimize("Os") #endif From 2ea9fbe51bd1f7ed9e0d4715b20eb7ebeb1d7c3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 13 Feb 2023 18:51:09 +0300 Subject: [PATCH 350/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index 771e93e5..408d4f59 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -43,6 +43,10 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic при восстановлении БД, в том числе при автоматическом откате слабых мета-страниц. + - Скорректированы макросы `__cold`/`__hot`, в том числе для устранения проблемы + `error: inlining failed in call to ‘always_inline FOO(...)’: target specific option mismatch` + при сборке посредством GCC >10.x для SH4. + Ликвидация технических долгов и мелочи: - Исправлены многочисленные опечатки в документации. @@ -56,6 +60,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Добавлены doxygen-описания для API с широкими символами. - Устранены предупреждения статического анализатора MSVC, все они были несущественные, либо ложные. 
+ - Устранено ложное предупреждение GCC при сборке для SH4. ------------------------------------------------------------------------------- From 29d12f1fc3ab10a41a6edab054ee884b9c57e6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 14 Feb 2023 12:09:44 +0300 Subject: [PATCH 351/364] =?UTF-8?q?mdbx-doc:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=BE=20=D0=BF=D1=80=D0=B8=D0=BC=D0=B5?= =?UTF-8?q?=D1=87=D0=B0=D0=BD=D0=B8=D0=B5=20=D0=BA=20=D0=BE=D0=BF=D1=86?= =?UTF-8?q?=D0=B8=D0=B8=20`MDBX=5FHAVE=5FBUILTIN=5FCPU=5FSUPPORTS`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 +- src/options.h | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/core.c b/src/core.c index 1f2b7adb..c4681f7e 100644 --- a/src/core.c +++ b/src/core.c @@ -6786,7 +6786,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #ifdef scan4seq_impl /* The scan4seq_impl() is the best or no alternatives */ #elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS -/* The scan4seq_default() will be used since no cpu-features detection support +/* The scan4seq_default() will be used since no cpu-features detection support * from compiler. Please don't ask to implement cpuid-based detection and don't * make such PRs. */ #define scan4seq_impl scan4seq_default diff --git a/src/options.h b/src/options.h index 596efdc0..0ef27e6f 100644 --- a/src/options.h +++ b/src/options.h @@ -220,7 +220,11 @@ #endif /* MDBX_HAVE_C11ATOMICS */ /** If defined then enables use the GCC's `__builtin_cpu_supports()` - * for runtime dispatching depending on the CPU's capabilities. */ + * for runtime dispatching depending on the CPU's capabilities. 
+ * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should be avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. */ #ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS #if defined(__APPLE__) || defined(BIONIC) /* Never use any modern features on Apple's or Google's OSes From 1b6e32071c44e83436b788949f0043e992efdf8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 22 Feb 2023 17:48:05 +0300 Subject: [PATCH 352/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=BE=D0=B2=D1=82=D0=BE?= =?UTF-8?q?=D1=80=D0=BD=D0=BE=D0=B5=20"=D1=83=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5"=20=D0=BF=D1=80=D0=B5=D0=B4?= =?UTF-8?q?=D1=83=D0=BF=D1=80=D0=B5=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D0=B9=20?= =?UTF-8?q?MSVC=20Static=20Analyzer=20(aka=20Prefast).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Никаких значимых изменений, только обход "странностей" в MSVC. Как оказалось MSVC распространяет действие директивы `pragma(warning(suppress:#))` строго на следующую строку, даже если эта строка является продолжением комментария начатого в самой директиве и/или не содержит синтаксических конструкций языка. Поэтому большинство из добавленных ранее директив для подавления ложных предупреждений, перестало работать после переформатирования исходного кода.
--- src/base.h | 15 ++++++++++----- src/core.c | 4 ++-- src/lck-windows.c | 12 ++++++------ src/osal.c | 30 ++++++++++++++++-------------- 4 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/base.h b/src/base.h index 7831b2cd..b8a243e8 100644 --- a/src/base.h +++ b/src/base.h @@ -669,14 +669,19 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ -#if MDBX_GOOFY_MSVC_STATIC_ANALYZER +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) #define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) -#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id, note) \ - _Pragma(MDBX_STRINGIFY(prefast(suppress : warn_id))) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif #else #define MDBX_ANALYSIS_ASSUME(expr) assert(expr) -#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id, note) -#endif +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ /*----------------------------------------------------------------------------*/ diff --git a/src/core.c b/src/core.c index c4681f7e..7b55cf63 100644 --- a/src/core.c +++ b/src/core.c @@ -8317,8 +8317,8 @@ static void cursors_eot(MDBX_txn *txn, const bool merge) { if (bk) { MDBX_xcursor *mx = mc->mc_xcursor; tASSERT(txn, txn->mt_parent != NULL); - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 6001, "Using uninitialized memory '*mc->mc_backup'."); + /* Zap: Using uninitialized memory '*mc->mc_backup'. 
*/ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); tASSERT(txn, mx == bk->mc_xcursor); if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) diff --git a/src/lck-windows.c b/src/lck-windows.c index 10c24503..8ffccb1b 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -195,9 +195,9 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } if (env->me_flags & MDBX_EXCLUSIVE) { - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 26115, "Failing to release lock 'env->me_windowsbug_lock' in function " - "'mdbx_txn_lock'"); + /* Zap: Failing to release lock 'env->me_windowsbug_lock' + * in function 'mdbx_txn_lock' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); return MDBX_SUCCESS; } @@ -218,9 +218,9 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } if (rc == MDBX_SUCCESS) { - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 26115, "Failing to release lock 'env->me_windowsbug_lock' in function " - "'mdbx_txn_lock'"); + /* Zap: Failing to release lock 'env->me_windowsbug_lock' + * in function 'mdbx_txn_lock' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); return rc; } diff --git a/src/osal.c b/src/osal.c index 748767ba..db70dc0b 100644 --- a/src/osal.c +++ b/src/osal.c @@ -48,7 +48,8 @@ static int ntstatus2errcode(NTSTATUS status) { OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); ov.Internal = status; - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387, "'_Param_(1)' could be '0'"); + /* Zap: '_Param_(1)' could be '0' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387); return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS : (int)GetLastError(); } @@ -83,8 +84,8 @@ extern NTSTATUS NTAPI NtMapViewOfSection( extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, IN OPTIONAL PVOID BaseAddress); -MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251, - "Inconsistent annotation for 'NtClose'...") +/* Zap: Inconsistent annotation for 'NtClose'... 
*/ +MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251) extern NTSTATUS NTAPI NtClose(HANDLE Handle); extern NTSTATUS NTAPI NtAllocateVirtualMemory( @@ -804,8 +805,8 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk( if (bytes & ior_WriteFile_flag) { data = Ptr64ToPtr(item->sgv[0].Buffer); bytes = ior->pagesize; - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 6385, "Reading invalid data from 'item->sgv'"); + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) { if (data + ior->pagesize != item->sgv[i].Buffer) { callback(ctx, offset, data, bytes); @@ -852,8 +853,8 @@ osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { if (bytes & ior_WriteFile_flag) { assert(ior->overlapped_fd && fd == ior->overlapped_fd); bytes = ior->pagesize; - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 6385, "Reading invalid data from 'item->sgv'"); + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) { bytes += ior->pagesize; ++i; @@ -992,8 +993,8 @@ osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; if (bytes & ior_WriteFile_flag) { bytes = ior->pagesize; - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 6385, "Reading invalid data from 'item->sgv'"); + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) { bytes += ior->pagesize; ++i; @@ -1088,8 +1089,8 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { ior_put_event(ior, item->ov.hEvent); size_t i = 1; if ((item->single.iov_len & ior_WriteFile_flag) == 0) { - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 6385, "Reading invalid data from 'item->sgv'"); + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); while (item->sgv[i].Buffer) ++i; } @@ -1108,8 +1109,8 @@ static void ior_cleanup(osal_ioring_t *ior, const size_t since) { osal_ioring_reset(ior); #if 
defined(_WIN32) || defined(_WIN64) for (size_t i = since; i < ior->event_stack; ++i) { - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER( - 6001, "Using uninitialized memory '**ior.event_pool'"); + /* Zap: Using uninitialized memory '**ior.event_pool' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); CloseHandle(ior->event_pool[i]); } ior->event_stack = 0; @@ -2749,7 +2750,8 @@ retry_mapview:; #endif /* POSIX / Windows */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287, "Redundant code"); + /* Zap: Redundant code */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287); assert(rc != MDBX_SUCCESS || (map->base != nullptr && map->base != MAP_FAILED && map->current == size && map->limit == limit && From 5f690bbc4f9a5f9f27d767f94ee346eea3356c47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 24 Feb 2023 10:43:00 +0300 Subject: [PATCH 353/364] =?UTF-8?q?mdbx-test:=20=D0=BF=D0=BE-=D1=83=D0=BC?= =?UTF-8?q?=D0=BE=D0=BB=D1=87=D0=B0=D0=BD=D0=B8=D1=8E=20=D1=80=D0=B0=D0=B1?= =?UTF-8?q?=D0=BE=D1=82=D0=B0=20=D0=B2=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC?= =?UTF-8?q?=D0=B5=20`MDBX=5FSYNC=5FDURABLE`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/main.c++ | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/main.c++ b/test/main.c++ index c96fc58f..2b8ff655 100644 --- a/test/main.c++ +++ b/test/main.c++ @@ -129,8 +129,7 @@ void actor_params::set_defaults(const std::string &tmpdir) { #endif pathname_db = tmpdir + "mdbx-test.db"; - mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_SAFE_NOSYNC | - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_ACCEDE; + mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_SYNC_DURABLE | MDBX_ACCEDE; table_flags = MDBX_DUPSORT; size_lower = -1; From 359489e27161776e1d91376cd1c0d3c138433e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Sat, 25 Feb 2023 14:07:18 +0300 Subject: [PATCH 354/364] =?UTF-8?q?mdbx:=20=D0=B8=D1=81=D0=BF=D1=80=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D1=81=D0=B5=D0=BC=D0=B0?= =?UTF-8?q?=D0=BD=D1=82=D0=B8=D1=87=D0=B5=D1=81=D0=BA=D0=BE=D0=B9=20=D0=BE?= =?UTF-8?q?=D0=BF=D0=B5=D1=87=D0=B0=D1=82=D0=BA=D0=B8=20=D0=B2=20=D0=BA?= =?UTF-8?q?=D0=BE=D0=BC=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D1=80=D0=B8=D0=B8=20?= =?UTF-8?q?=D0=BE=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=B5=20=D1=80=D0=B0?= =?UTF-8?q?=D0=B1=D0=BE=D1=82=D1=8B.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 7b55cf63..040217ad 100644 --- a/src/core.c +++ b/src/core.c @@ -14986,7 +14986,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { - /* Запрошен режим MDBX_SAFE_NOSYNC | MDBX_WRITEMAP при активной опции + /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции * MDBX_AVOID_MSYNC. 
* * 1) В этой комбинации наиболее выгодно использовать WriteFileGather(), From 6d74b10db1d41db1f1872e0b71d88a8de3f4589e Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 26 Feb 2023 20:26:54 +0300 Subject: [PATCH 355/364] =?UTF-8?q?mdbx:=20=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5?= =?UTF-8?q?=D1=80=D0=B6=D0=BA=D0=B0=20ASAN=20(Address=20Sanitizer)=20?= =?UTF-8?q?=D0=BF=D1=80=D0=B8=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B5=20?= =?UTF-8?q?=D0=BF=D0=BE=D1=81=D1=80=D0=B5=D0=B4=D1=81=D1=82=D0=B2=D0=BE?= =?UTF-8?q?=D0=BC=20MSVC.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/compiler.cmake | 17 +++++++++++++---- src/core.c | 5 ++++- src/mdbx.c++ | 6 ++++++ test/base.h++ | 4 ++++ 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/cmake/compiler.cmake b/cmake/compiler.cmake index dd6f71c8..1d805ea0 100644 --- a/cmake/compiler.cmake +++ b/cmake/compiler.cmake @@ -348,6 +348,8 @@ endif() if(MSVC) check_compiler_flag("/WX" CC_HAS_WERROR) + check_compiler_flag("/fsanitize=address" CC_HAS_ASAN) + check_compiler_flag("/fsanitize=undefined" CC_HAS_UBSAN) else() # # GCC started to warn for unused result starting from 4.2, and @@ -839,19 +841,26 @@ macro(setup_compile_flags) endif() if(ENABLE_ASAN) - add_compile_flags("C;CXX" "-fsanitize=address") + if(NOT MSVC) + add_compile_flags("C;CXX" "-fsanitize=address") + else() + add_compile_flags("C;CXX" "/fsanitize=address") + endif() add_definitions(-DASAN_ENABLED=1) endif() if(ENABLE_UBSAN) - add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error") + if(NOT MSVC) + add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error") + else() + add_compile_flags("C;CXX" "/fsanitize=undefined") + endif() add_definitions(-DUBSAN_ENABLED=1) endif() if(ENABLE_GCOV) if(NOT HAVE_GCOV) - message(FATAL_ERROR - "ENABLE_GCOV option requested but gcov library is not found") + message(FATAL_ERROR "ENABLE_GCOV option requested but gcov library 
is not found") endif() add_compile_flags("C;CXX" "-fprofile-arcs" "-ftest-coverage") diff --git a/src/core.c b/src/core.c index 040217ad..3f52d4e2 100644 --- a/src/core.c +++ b/src/core.c @@ -25654,7 +25654,10 @@ __dll_export }; #ifdef __SANITIZE_ADDRESS__ -LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options(void) { +#if !defined(_MSC_VER) || __has_attribute(weak) +LIBMDBX_API __attribute__((__weak__)) +#endif +const char *__asan_default_options(void) { return "symbolize=1:allow_addr2line=1:" #if MDBX_DEBUG "debug=1:" diff --git a/src/mdbx.c++ b/src/mdbx.c++ index 8a74e412..590cc07d 100644 --- a/src/mdbx.c++ +++ b/src/mdbx.c++ @@ -14,6 +14,12 @@ #define __USE_MINGW_ANSI_STDIO 1 #endif /* MinGW */ +/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */ +#if defined(_MSC_VER) && defined(__SANITIZE_ADDRESS__) && \ + !defined(_DISABLE_VECTOR_ANNOTATION) +#define _DISABLE_VECTOR_ANNOTATION +#endif /* _DISABLE_VECTOR_ANNOTATION */ + #include "../mdbx.h++" #include "internals.h" diff --git a/test/base.h++ b/test/base.h++ index f4e083bd..f3a7701e 100644 --- a/test/base.h++ +++ b/test/base.h++ @@ -30,6 +30,10 @@ #define _WIN32_WINNT 0x0601 /* Windows 7 */ #endif #ifdef _MSC_VER +/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */ +#if defined(__SANITIZE_ADDRESS__) && !defined(_DISABLE_VECTOR_ANNOTATION) +#define _DISABLE_VECTOR_ANNOTATION +#endif /* _DISABLE_VECTOR_ANNOTATION */ #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif /* _CRT_SECURE_NO_WARNINGS */ From 5c52adf35898dd8f2a63191c4eb6d852d0027935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 27 Feb 2023 15:17:22 +0300 Subject: [PATCH 356/364] =?UTF-8?q?mdbx-test:=20=D1=80=D0=B0=D1=81=D1=88?= =?UTF-8?q?=D0=B8=D1=80=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BD=D0=B0=D0=B1=D0=BE?= 
=?UTF-8?q?=D1=80=D0=B0=20=D1=80=D0=B5=D0=B6=D0=B8=D0=BC=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=BF=D0=B5=D1=80=D0=B5=D0=B1=D0=B8=D1=80=D0=B0=D0=B5=D0=BC?= =?UTF-8?q?=D1=8B=D1=85=20=D1=81=D0=BA=D1=80=D0=B8=D0=BF=D1=82=D0=BE=D0=BC?= =?UTF-8?q?=20`test/long=5Fstochastic.sh`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/long_stochastic.sh | 54 +++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 906bacee..5d06bde7 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -350,8 +350,8 @@ else } fi -syncmodes=("" ,+nosync-safe ,+nosync-utterly) -options=(writemap lifo notls perturb) +syncmodes=("" ,+nosync-safe ,+nosync-utterly ,+nometasync) +options=(writemap lifo notls perturb nomeminit nordahead) function join { local IFS="$1"; shift; echo "$*"; } @@ -414,65 +414,87 @@ for nops in 10 33 100 333 1000 3333 10000 33333 100000 333333 1000000 3333333 10 split=30 caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ 
--keygen.seed=${seed} caption="Probe #$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} split=24 caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} split=16 caption="Probe #$((++count)) int-key,w/o-dups, 
split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE 
--size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + + split=10 + caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) 
with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} split=4 caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} done # options loop=$((loop + 1)) From 800bd55ab90f5b780dfeac5558048c4b15faff40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= 
=?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Mon, 27 Feb 2023 20:45:25 +0300 Subject: [PATCH 357/364] =?UTF-8?q?mdbx-test:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BE=D0=BF=D1=86=D0=B8?= =?UTF-8?q?=D0=B8=20`--extra`=20=D0=B2=20=D1=81=D0=BA=D1=80=D0=B8=D0=BF?= =?UTF-8?q?=D1=82=20`test/long=5Fstochastic.sh`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/long_stochastic.sh | 62 ++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 5d06bde7..900c1319 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -12,6 +12,7 @@ UNAME="$(uname -s 2>/dev/null || echo Unknown)" DB_UPTO_MB=17408 PAGESIZE=min DONT_CHECK_RAM=no +EXTRA=no while [ -n "$1" ] do @@ -31,8 +32,9 @@ do echo "--dir PATH Specifies directory for test DB and other files (it will be cleared)" echo "--db-upto-mb NN Limits upper size of test DB to the NN megabytes" echo "--no-geometry-jitter Disable jitter for geometry upper-size" - echo "--pagesize NN Use specified page size (256 is minimal and used by default) " - echo "--dont-check-ram-size Don't check available RAM " + echo "--pagesize NN Use specified page size (256 is minimal and used by default)" + echo "--dont-check-ram-size Don't check available RAM" + echo "--extra Iterate extra modes/flags" echo "--help Print this usage help and exit" exit -2 ;; @@ -136,7 +138,7 @@ do PAGESIZE=$((1024*64)) ;; *) - echo "Invalig page size '$2'" + echo "Invalid page size '$2'" exit -2 ;; esac @@ -145,6 +147,9 @@ do --dont-check-ram-size) DONT_CHECK_RAM=yes ;; + --extra) + EXTRA=yes + ;; *) echo "Unknown option '$1'" exit -2 @@ -350,9 +355,12 @@ else } fi +if [ "$EXTRA" != "no" ]; then + options=(writemap lifo notls perturb nomeminit nordahead) +else + options=(writemap lifo notls) +fi syncmodes=("" ,+nosync-safe ,+nosync-utterly ,+nometasync) 
-options=(writemap lifo notls perturb nomeminit nordahead) - function join { local IFS="$1"; shift; echo "$*"; } function bits2options { @@ -461,27 +469,29 @@ for nops in 10 33 100 333 1000 3333 10000 33333 100000 333333 1000000 3333333 10 --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} - split=10 - caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ - --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ - --keygen.seed=${seed} - caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ - --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ - --keygen.seed=${seed} - caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ - --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ - --keygen.seed=${seed} - caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ - --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ - --keygen.seed=${seed} - caption="Probe #$((++count)) with-dups, 
split=${split}, case $((++subcase)) of ${cases}" probe \ - --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ - --keygen.seed=${seed} + if [ "$EXTRA" != "no" ]; then + split=10 + caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe 
#$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + fi split=4 caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ From 8f87ab252e2dc2abdf25b08cfe6f6a19164cc6f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 28 Feb 2023 00:52:40 +0300 Subject: [PATCH 358/364] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=BF=D0=BE=D0=BB?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index 408d4f59..4ed3e84c 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,7 +4,7 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.13 (в разработке) +## v0.12.4 (подготовка к релизу) Благодарности: @@ -61,6 +61,9 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Устранены предупреждения статического анализатора MSVC, все они были несущественные, либо ложные. - Устранено ложное предупреждение GCC при сборке для SH4. + - Добавлена поддержка ASAN (Address Sanitizer) при сборке посредством MSVC. + - Расширен набор перебираемых режимов в скрипте `test/long_stochastic.sh`, + добавлена опция `--extra`. 
------------------------------------------------------------------------------- From 2ae7bfd9be2fcdb40cf4e816f91cb2697c7b5aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 28 Feb 2023 21:52:34 +0300 Subject: [PATCH 359/364] =?UTF-8?q?mdbx-make:=20=D0=B0=D0=BA=D1=82=D1=83?= =?UTF-8?q?=D0=B0=D0=BB=D0=B8=D0=B7=D0=B0=D1=86=D0=B8=D1=8F=20=D1=81=D0=BF?= =?UTF-8?q?=D0=B8=D1=81=D0=BA=D0=BE=D0=B2=20=D0=B4=D0=BB=D1=8F=20=D1=86?= =?UTF-8?q?=D0=B5=D0=BB=D0=B5=D0=B9=20`cross-gcc`=20=D0=B8=20`cross-qemu`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GNUmakefile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 89b8e506..566feee1 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -715,23 +715,23 @@ endif ################################################################################ # Cross-compilation simple test -CROSS_LIST = mips-linux-gnu-gcc \ +CROSS_LIST = \ + mips64-linux-gnuabi64-gcc mips-linux-gnu-gcc \ + hppa-linux-gnu-gcc s390x-linux-gnu-gcc \ powerpc64-linux-gnu-gcc powerpc-linux-gnu-gcc \ - arm-linux-gnueabihf-gcc aarch64-linux-gnu-gcc \ - sh4-linux-gnu-gcc mips64-linux-gnuabi64-gcc \ - hppa-linux-gnu-gcc s390x-linux-gnu-gcc + arm-linux-gnueabihf-gcc aarch64-linux-gnu-gcc -## On Ubuntu Focal (20.04) with QEMU 4.2 (1:4.2-3ubuntu6.6) & GCC 9.3 (9.3.0-17ubuntu1~20.04) -# hppa-linux-gnu-gcc - works (previously: don't supported by qemu) -# s390x-linux-gnu-gcc - works (previously: qemu hang/abort) +## On Ubuntu Jammy (22.04) with QEMU 6.2 (1:6.2+dfsg-2ubuntu6.6) & GCC 11.3 (11.3.0-1ubuntu1~22.04) +# sh4-linux-gnu-gcc - coredump (qemu mmap-troubles) # sparc64-linux-gnu-gcc - coredump (qemu mmap-troubles, previously: qemu fails fcntl for F_SETLK/F_GETLK) # alpha-linux-gnu-gcc - coredump (qemu mmap-troubles) -CROSS_LIST_NOQEMU =
sparc64-linux-gnu-gcc alpha-linux-gnu-gcc riscv64-linux-gnu-gcc +# riscv64-linux-gnu-gcc - coredump (qemu fails fcntl for F_SETLK/F_GETLK) +CROSS_LIST_NOQEMU = sh4-linux-gnu-gcc sparc64-linux-gnu-gcc alpha-linux-gnu-gcc riscv64-linux-gnu-gcc cross-gcc: @echo ' Re-building by cross-compiler for: $(CROSS_LIST_NOQEMU) $(CROSS_LIST)' @echo "CORRESPONDING CROSS-COMPILERs ARE REQUIRED." - @echo "FOR INSTANCE: apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu riscv64-linux-gnu-gcc" + @echo "FOR INSTANCE: sudo apt install \$$(apt list 'g++-*' | grep 'g++-[a-z0-9]\+-linux-gnu/' | cut -f 1 -d / | sort -u)" $(QUIET)for CC in $(CROSS_LIST_NOQEMU) $(CROSS_LIST); do \ echo "===================== $$CC"; \ $(MAKE) IOARENA=false CXXSTD= clean && CC=$$CC CXX=$$(echo $$CC | sed 's/-gcc/-g++/') EXE_LDFLAGS=-static $(MAKE) IOARENA=false all || exit $$?; \ @@ -743,8 +743,8 @@ cross-qemu: @echo ' Re-building by cross-compiler and re-check by QEMU for: $(CROSS_LIST)' @echo "CORRESPONDING CROSS-COMPILERs AND QEMUs ARE REQUIRED."
@echo "FOR INSTANCE: " - @echo " 1) apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" - @echo " 2) apt install binfmt-support qemu-user-static qemu-user qemu-system-arm qemu-system-mips qemu-system-misc qemu-system-ppc qemu-system-sparc" + @echo " 1) sudo apt install \$$(apt list 'g++-*' | grep 'g++-[a-z0-9]\+-linux-gnu/' | cut -f 1 -d / | sort -u)" + @echo " 2) sudo apt install binfmt-support qemu-user-static qemu-user \$$(apt list 'qemu-system-*' | grep 'qemu-system-[a-z0-9]\+/' | cut -f 1 -d / | sort -u)" $(QUIET)for CC in $(CROSS_LIST); do \ echo "===================== $$CC + qemu"; \ $(MAKE) IOARENA=false CXXSTD= clean && \ From 22405885f61690b62c69b9def614fe1ff79bf4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 1 Mar 2023 01:09:10 +0300 Subject: [PATCH 360/364] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=80=D1=80=D0=B5?= =?UTF-8?q?=D0=BA=D1=82=D0=B8=D1=80=D0=BE=D0=B2=D0=BA=D0=B0=20=D0=B8=D0=B7?= =?UTF-8?q?=D0=BB=D0=B8=D1=88=D0=BD=D0=B5=D0=B9=20assert-=D0=BF=D1=80?= =?UTF-8?q?=D0=BE=D0=B2=D0=B5=D1=80=D0=BA=D0=B8=20=D0=B2=D0=BD=D1=83=D1=82?= =?UTF-8?q?=D1=80=D0=B8=20`override=5Fmeta()`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/core.c b/src/core.c index 3f52d4e2..0a2d1da3 100644 --- a/src/core.c +++ b/src/core.c @@ -14459,7 +14459,10 @@ __cold static int __must_check_result override_meta(MDBX_env *env, osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); } - eASSERT(env, !env->me_txn && !env->me_txn0); + eASSERT(env, (!env->me_txn && !env->me_txn0) || + (env->me_stuck_meta == (int)target && 
+ (env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == + MDBX_EXCLUSIVE)); return rc; } From 7db014c4fc381bb61d1d4fb1d724de7cd521f123 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Wed, 1 Mar 2023 23:18:09 +0300 Subject: [PATCH 361/364] =?UTF-8?q?mdbx++:=20=D0=B4=D0=BE=D0=B1=D0=B0?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=B2=20C++=20API=20?= =?UTF-8?q?=D0=BF=D0=BE=D0=B4=D0=B4=D0=B5=D1=80=D0=B6=D0=BA=D0=B8=20=D1=80?= =?UTF-8?q?=D0=B0=D1=81=D1=88=D0=B8=D1=80=D0=B5=D0=BD=D0=BD=D1=8B=D1=85=20?= =?UTF-8?q?=D0=BE=D0=BF=D1=86=D0=B8=D0=B9=20=D0=B2=D1=80=D0=B5=D0=BC=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=20=D0=B2=D1=8B=D0=BF=D0=BE=D0=BB=D0=BD=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D1=8F=20`enum=20MDBX=5Foption=5Ft`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://gitflic.ru/project/erthink/libmdbx/issue/4 --- mdbx.h | 8 ++- mdbx.h++ | 198 ++++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 159 insertions(+), 47 deletions(-) diff --git a/mdbx.h b/mdbx.h index ce46b6f6..08542765 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2060,7 +2060,9 @@ LIBMDBX_API const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, * \returns a non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); -/** \brief MDBX environment options. */ +/** \brief MDBX environment extra runtime options. + * \ingroup c_settings + * \see mdbx_env_set_option() \see mdbx_env_get_option() */ enum MDBX_option_t { /** \brief Controls the maximum number of named databases for the environment. * @@ -2268,7 +2270,7 @@ enum MDBX_option_t { typedef enum MDBX_option_t MDBX_option_t; #endif -/** \brief Sets the value of a runtime options for an environment. +/** \brief Sets the value of a extra runtime options for an environment. 
* \ingroup c_settings * * \param [in] env An environment handle returned by \ref mdbx_env_create(). @@ -2281,7 +2283,7 @@ typedef enum MDBX_option_t MDBX_option_t; LIBMDBX_API int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, uint64_t value); -/** \brief Gets the value of runtime options from an environment. +/** \brief Gets the value of extra runtime options from an environment. * \ingroup c_settings * * \param [in] env An environment handle returned by \ref mdbx_env_create(). diff --git a/mdbx.h++ b/mdbx.h++ index 8e5e5ecf..a05f1c63 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -84,6 +84,11 @@ #include #endif +#if __cplusplus >= 201103L +#include <chrono> +#include <ratio> +#endif + #include "mdbx.h" #if (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L) || \ @@ -386,6 +391,11 @@ using path = ::std::wstring; using path = ::std::string; #endif /* mdbx::path */ +#if __cplusplus >= 201103L || defined(DOXYGEN) +/// \brief Duration in 1/65536 units of second. +using duration = ::std::chrono::duration<unsigned, ::std::ratio<1, 65536>>; +#endif /* Duration for C++11 */ + /// \defgroup cxx_exceptions exceptions and errors /// @{ @@ -3346,9 +3356,11 @@ public: /// \brief Returns the maximum number of threads/reader slots for the /// environment. + /// \see extra_runtime_option::max_readers inline unsigned max_readers() const; /// \brief Returns the maximum number of named databases for the environment. + /// \see extra_runtime_option::max_maps inline unsigned max_maps() const; /// \brief Returns the application context associated with the environment. @@ -3360,59 +3372,117 @@ public: /// \brief Sets threshold to force flush the data buffers to disk, for /// non-sync durability modes. /// - /// The threshold value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. 
- /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered.
MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. The default is 0, than mean no any threshold - /// checked, and no additional flush will be made. + /// \details The threshold value affects all processes which operates with + /// given environment until the last process close environment or a new value + /// will be settled. Data is always written to disk when \ref + /// txn_managed::commit() is called, but the operating system may keep it + /// buffered. MDBX always flushes the OS buffers upon commit as well, unless + /// the environment was opened with \ref whole_fragile, \ref lazy_weak_tail or + /// in part \ref half_synchronous_weak_last. /// + /// The default is 0, than mean no any threshold checked, and no additional + /// flush will be made. + /// \see extra_runtime_option::sync_bytes inline env &set_sync_threshold(size_t bytes); + /// \brief Gets threshold used to force flush the data buffers to disk, for + /// non-sync durability modes. + /// + /// \copydetails set_sync_threshold() + /// \see extra_runtime_option::sync_bytes + inline size_t sync_threshold() const; + +#if __cplusplus >= 201103L || defined(DOXYGEN) /// \brief Sets relative period since the last unsteady commit to force flush /// the data buffers to disk, for non-sync durability modes. /// - /// The relative period value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. - /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered. MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. 
Settled period don't checked asynchronously, - /// but only by the \ref txn_managed::commit() and \ref env::sync_to_disk() - /// functions. Therefore, in cases where transactions are committed - /// infrequently and/or irregularly, polling by \ref env::poll_sync_to_disk() - /// may be a reasonable solution to timeout enforcement. The default is 0, - /// than mean no any timeout checked, and no additional flush will be made. + /// \details The relative period value affects all processes which operates + /// with given environment until the last process close environment or a new + /// value will be settled. Data is always written to disk when \ref + /// txn_managed::commit() is called, but the operating system may keep it + /// buffered. MDBX always flushes the OS buffers upon commit as well, unless + /// the environment was opened with \ref whole_fragile, \ref lazy_weak_tail or + /// in part \ref half_synchronous_weak_last. Settled period don't checked + /// asynchronously, but only by the \ref txn_managed::commit() and \ref + /// env::sync_to_disk() functions. Therefore, in cases where transactions are + /// committed infrequently and/or irregularly, polling by \ref + /// env::poll_sync_to_disk() may be a reasonable solution to timeout + /// enforcement. /// + /// The default is 0, than mean no any timeout checked, and no additional + /// flush will be made. + /// \see extra_runtime_option::sync_period + inline env &set_sync_period(const duration &period); + + /// \brief Gets relative period since the last unsteady commit that used to + /// force flush the data buffers to disk, for non-sync durability modes. 
+ /// \copydetails set_sync_period(const duration&) + /// \see set_sync_period(const duration&) + /// \see extra_runtime_option::sync_period + inline duration sync_period() const; +#endif + + /// \copydoc set_sync_period(const duration&) /// \param [in] seconds_16dot16 The period in 1/65536 of second when a /// synchronous flush would be made since the last unsteady commit. - inline env &set_sync_period(unsigned seconds_16dot16); + inline env &set_sync_period__seconds_16dot16(unsigned seconds_16dot16); - /// \brief Sets relative period since the last unsteady commit to force flush - /// the data buffers to disk, for non-sync durability modes. - /// - /// The relative period value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. - /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered. MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. Settled period don't checked asynchronously, - /// but only by the \ref txn_managed::commit() and \ref env::sync_to_disk() - /// functions. Therefore, in cases where transactions are committed - /// infrequently and/or irregularly, polling by \ref env::poll_sync_to_disk() - /// may be a reasonable solution to timeout enforcement. The default is 0, - /// than mean no any timeout checked, and no additional flush will be made. 
- /// + /// \copydoc sync_period() + /// \see set_sync_period__seconds_16dot16(unsigned) + inline unsigned sync_period__seconds_16dot16() const; + + /// \copydoc set_sync_period(const duration&) /// \param [in] seconds The period in second when a synchronous flush would /// be made since the last unsteady commit.
- inline env &set_sync_period(double seconds); + inline env &set_sync_period__seconds_double(double seconds); + + /// \copydoc sync_period() + /// \see set_sync_period__seconds_double(double) + inline double sync_period__seconds_double() const; + + /// \copydoc MDBX_option_t + enum class extra_runtime_option { + /// \copydoc MDBX_opt_max_db + /// \see max_maps() \see env::operate_parameters::max_maps + max_maps = MDBX_opt_max_db, + /// \copydoc MDBX_opt_max_readers + /// \see max_readers() \see env::operate_parameters::max_readers + max_readers = MDBX_opt_max_readers, + /// \copydoc MDBX_opt_sync_bytes + /// \see sync_threshold() \see set_sync_threshold() + sync_bytes = MDBX_opt_sync_bytes, + /// \copydoc MDBX_opt_sync_period + /// \see sync_period() \see set_sync_period() + sync_period = MDBX_opt_sync_period, + /// \copydoc MDBX_opt_rp_augment_limit + rp_augment_limit = MDBX_opt_rp_augment_limit, + /// \copydoc MDBX_opt_loose_limit + loose_limit = MDBX_opt_loose_limit, + /// \copydoc MDBX_opt_dp_reserve_limit + dp_reserve_limit = MDBX_opt_dp_reserve_limit, + /// \copydoc MDBX_opt_txn_dp_limit + dp_limit = MDBX_opt_txn_dp_limit, + /// \copydoc MDBX_opt_txn_dp_initial + dp_initial = MDBX_opt_txn_dp_initial, + /// \copydoc MDBX_opt_spill_max_denominator + spill_max_denominator = MDBX_opt_spill_max_denominator, + /// \copydoc MDBX_opt_spill_min_denominator + spill_min_denominator = MDBX_opt_spill_min_denominator, + /// \copydoc MDBX_opt_spill_parent4child_denominator + spill_parent4child_denominator = MDBX_opt_spill_parent4child_denominator, + /// \copydoc MDBX_opt_merge_threshold_16dot16_percent + merge_threshold_16dot16_percent = MDBX_opt_merge_threshold_16dot16_percent, + /// \copydoc MDBX_opt_writethrough_threshold + writethrough_threshold = MDBX_opt_writethrough_threshold, + /// \copydoc MDBX_opt_prefault_write_enable + prefault_write_enable = MDBX_opt_prefault_write_enable, + }; + + /// \copybrief mdbx_env_set_option() + inline env 
&set_extra_option(extra_runtime_option option, uint64_t value); + + /// \copybrief mdbx_env_get_option() + inline uint64_t extra_option(extra_runtime_option option) const; /// \brief Alter environment flags. inline env &alter_flags(MDBX_env_flags_t flags, bool on_off); @@ -5068,13 +5138,53 @@ inline env &env::set_sync_threshold(size_t bytes) { return *this; } -inline env &env::set_sync_period(unsigned seconds_16dot16) { +inline size_t env::sync_threshold() const { + size_t bytes; + error::success_or_throw(::mdbx_env_get_syncbytes(handle_, &bytes)); + return bytes; +} + +inline env &env::set_sync_period__seconds_16dot16(unsigned seconds_16dot16) { error::success_or_throw(::mdbx_env_set_syncperiod(handle_, seconds_16dot16)); return *this; } -inline env &env::set_sync_period(double seconds) { - return set_sync_period(unsigned(seconds * 65536)); +inline unsigned env::sync_period__seconds_16dot16() const { + unsigned seconds_16dot16; + error::success_or_throw(::mdbx_env_get_syncperiod(handle_, &seconds_16dot16)); + return seconds_16dot16; +} + +inline env &env::set_sync_period__seconds_double(double seconds) { + return set_sync_period__seconds_16dot16(unsigned(seconds * 65536)); +} + +inline double env::sync_period__seconds_double() const { + return sync_period__seconds_16dot16() / 65536.0; +} + +#if __cplusplus >= 201103L +inline env &env::set_sync_period(const duration &period) { + return set_sync_period__seconds_16dot16(period.count()); +} + +inline duration env::sync_period() const { + return duration(sync_period__seconds_16dot16()); +} +#endif + +inline env &env::set_extra_option(enum env::extra_runtime_option option, + uint64_t value) { + error::success_or_throw( + ::mdbx_env_set_option(handle_, ::MDBX_option_t(option), value)); + return *this; +} + +inline uint64_t env::extra_option(enum env::extra_runtime_option option) const { + uint64_t value; + error::success_or_throw( + ::mdbx_env_get_option(handle_, ::MDBX_option_t(option), &value)); + return value; } 
inline env &env::alter_flags(MDBX_env_flags_t flags, bool on_off) { From f17c55a872adaa9432fbee6c7d61576af1ebf0be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 2 Mar 2023 16:34:19 +0300 Subject: [PATCH 362/364] =?UTF-8?q?mdbx:=20=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2?= =?UTF-8?q?=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=20ChangeLog.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index 4ed3e84c..62bad5d3 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -6,6 +6,16 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic ## v0.12.4 (подготовка к релизу) +Стабилизирующий выпуск с исправлением обнаруженных ошибок, устранением +недочетов и технических долгов. Ветка 0.12 считается готовой к +продуктовому использованию, получает статус стабильной и далее будет +получать только исправление ошибок. Разработка будет продолжена в ветке +0.13, а ветка 0.11 становится архивной. + +``` +63 files changed, 1144 insertions(+), 569 deletions(-) +``` + Благодарности: - Max за сообщение о проблеме ERROR_SHARING_VIOLATION @@ -64,6 +74,8 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic - Добавлена поддержка ASAN (Address Sanitizer) при сборке посредством MSVC. - Расширен набор перебираемых режимов в скрипте `test/long_stochastic.sh`, добавлена опция `--extra`. + - В C++ API добавлена поддержка расширенных опций времени выполнения `mdbx::extra_runtime_option`, + аналогично `enum MDBX_option_t` из C API. 
------------------------------------------------------------------------------- From ad93633d10d36c350cd5dabf0758149042fd3326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 3 Mar 2023 16:02:37 +0300 Subject: [PATCH 363/364] =?UTF-8?q?mdbx-tools:=20=D0=B2=D1=8B=D0=B2=D0=BE?= =?UTF-8?q?=D0=B4=20=D0=B2=D1=81=D0=B5=D1=85=20=D1=81=D1=87=D0=B5=D1=82?= =?UTF-8?q?=D1=87=D0=B8=D0=BA=D0=BE=D0=B2=20page-operations=20=D0=B2=20`md?= =?UTF-8?q?bx=5Fstat`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mdbx_stat.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/mdbx_stat.c b/src/mdbx_stat.c index 860cf54d..adedc13e 100644 --- a/src/mdbx_stat.c +++ b/src/mdbx_stat.c @@ -256,6 +256,17 @@ int main(int argc, char *argv[]) { printf(" WOP: %8" PRIu64 "\t// number of explicit write operations (not a pages) to a disk\n", mei.mi_pgop_stat.wops); + printf(" PreFault: %8" PRIu64 + "\t// number of prefault write operations (not a pages)\n", + mei.mi_pgop_stat.prefault); + printf(" mInCore: %8" PRIu64 "\t// number of mincore() calls\n", + mei.mi_pgop_stat.mincore); + printf(" mSync: %8" PRIu64 + "\t// number of explicit msync-to-disk operations (not a pages)\n", + mei.mi_pgop_stat.msync); + printf(" fSync: %8" PRIu64 + "\t// number of explicit fsync-to-disk operations (not a pages)\n", + mei.mi_pgop_stat.fsync); } if (envinfo) { From 53177e483c18adf5109aed7a5895915e9798ee24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 3 Mar 2023 23:23:08 +0300 Subject: [PATCH 364/364] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=BF=D1=83=D1=81?= =?UTF-8?q?=D0=BA=200.12.4=20"=D0=90=D1=80=D1=82=D0=B0-333"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Стабилизирующий выпуск с 
исправлением обнаруженных ошибок, устранением недочетов и технических долгов. Ветка 0.12 считается готовой к продуктовому использованию, получает статус стабильной и далее будет получать только исправление ошибок. Разработка будет продолжена в ветке 0.13, а ветка 0.11 становится архивной. Благодарности: -------------- - Max за сообщение о проблеме ERROR_SHARING_VIOLATION в режиме MDBX_EXCLUSIVE на Windows. - Alisher Ashyrov за сообщение о проблеме с assert-проверкой и содействие в отладке. - Masatoshi Fukunaga за сообщение о проблеме `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены всех значений в subDb. Исправления (без корректировок новых функций): ---------------------------------------------- - Устранен регресс после коммита 474391c83c5f81def6fdf3b0b6f5716a87b78fbf, приводящий к возврату ERROR_SHARING_VIOLATION в Windows при открытии БД в режиме MDBX_EXCLUSIVE для чтения-записи. - Добавлено ограничение размера отображения при коротком read-only файле, для предотвращения ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая возникает в этом случае и совсем не информативна для пользователя. - Произведен рефакторинг `dxb_resize()`, в том числе, для устранения срабатывания assert-проверки `size_bytes == env->me_dxb_mmap.current` в специфических многопоточных сценариях использования. Проверка срабатывала только в отладочных сборках, при специфическом наложении во времени читающей и пишущей транзакции в разных потоках, одновременно с изменением размера БД. Кроме срабатывание проверки, каких-либо других последствий не возникало. - Устранена проблема в `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены всех значений единственного ключа в subDb. В ходе этой операции subDb становится полностью пустой, без каких-либо страниц и именно эта ситуация не была учтена в коде, что приводило к повреждению БД при фиксации такой транзакции. - Устранена излишняя assert-проверка внутри `override_meta()`. 
Что в отладочных сборках могло приводить к ложным срабатываниям при восстановлении БД, в том числе при автоматическом откате слабых мета-страниц. - Скорректированы макросы `__cold`/`__hot`, в том числе для устранения проблемы `error: inlining failed in call to ‘always_inline FOO(...)’: target specific option mismatch` при сборке посредством GCC >10.x для SH4. Ликвидация технических долгов и мелочи: --------------------------------------- - Исправлены многочисленные опечатки в документации. - Доработан тест для полной стохастической проверки `MDBX_EKEYMISMATCH` в режиме `MDBX_APPEND`. - Расширены сценарии запуска `mdbx_chk` из CMake-тестов для проверки как в обычном, так и эксклюзивном режимах чтения-записи. - Уточнены спецификаторы `const` и `noexcept` для нескольких методов в C++ API. - Устранено использование стека под буферы для `wchar`-преобразования путей. - Для Windows добавлена функция `mdbx_env_get_path()` для получения пути к БД в формате многобайтных символов. - Добавлены doxygen-описания для API с широкими символами. - Устранены предупреждения статического анализатора MSVC, все они были несущественные, либо ложные. - Устранено ложное предупреждение GCC при сборке для SH4. - Добавлена поддержка ASAN (Address Sanitizer) при сборке посредством MSVC. - Расширен набор перебираемых режимов в скрипте `test/long_stochastic.sh`, добавлена опция `--extra`. - В C++ API добавлена поддержка расширенных опций времени выполнения `mdbx::extra_runtime_option`, аналогично `enum MDBX_option_t` из C API. - Вывод всех счетчиков page-operations в `mdbx_stat`. 
63 files changed, 1161 insertions(+), 569 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) --- ChangeLog.md | 6 ++++-- src/man1/mdbx_chk.1 | 2 +- src/man1/mdbx_copy.1 | 2 +- src/man1/mdbx_drop.1 | 2 +- src/man1/mdbx_dump.1 | 2 +- src/man1/mdbx_load.1 | 2 +- src/man1/mdbx_stat.1 | 2 +- 7 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 62bad5d3..a0296737 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,7 +4,7 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.4 (подготовка к релизу) +## v0.12.4 (Арта-333) от 2023-03-03 Стабилизирующий выпуск с исправлением обнаруженных ошибок, устранением недочетов и технических долгов. Ветка 0.12 считается готовой к @@ -13,7 +13,8 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic 0.13, а ветка 0.11 становится архивной. ``` -63 files changed, 1144 insertions(+), 569 deletions(-) +63 files changed, 1161 insertions(+), 569 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` Благодарности: @@ -76,6 +77,7 @@ and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic добавлена опция `--extra`. - В C++ API добавлена поддержка расширенных опций времени выполнения `mdbx::extra_runtime_option`, аналогично `enum MDBX_option_t` из C API. + - Вывод всех счетчиков page-operations в `mdbx_stat`. ------------------------------------------------------------------------------- diff --git a/src/man1/mdbx_chk.1 b/src/man1/mdbx_chk.1 index 9141bf7a..0f5810d4 100644 --- a/src/man1/mdbx_chk.1 +++ b/src/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ .\" Copyright 2015-2023 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_CHK 1 "2023-01-07" "MDBX 0.12.3" +.TH MDBX_CHK 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/src/man1/mdbx_copy.1 b/src/man1/mdbx_copy.1 index b83c0a27..729919b6 100644 --- a/src/man1/mdbx_copy.1 +++ b/src/man1/mdbx_copy.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2023-01-07" "MDBX 0.12.3" +.TH MDBX_COPY 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/src/man1/mdbx_drop.1 b/src/man1/mdbx_drop.1 index e2beaef3..86dd8666 100644 --- a/src/man1/mdbx_drop.1 +++ b/src/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ .\" Copyright 2021-2023 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2023-01-07" "MDBX 0.12.3" +.TH MDBX_DROP 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/src/man1/mdbx_dump.1 b/src/man1/mdbx_dump.1 index 007705a3..d6eb9577 100644 --- a/src/man1/mdbx_dump.1 +++ b/src/man1/mdbx_dump.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2023-01-07" "MDBX 0.12.3" +.TH MDBX_DUMP 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/src/man1/mdbx_load.1 b/src/man1/mdbx_load.1 index 65ed20aa..798814d9 100644 --- a/src/man1/mdbx_load.1 +++ b/src/man1/mdbx_load.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_LOAD 1 "2023-01-07" "MDBX 0.12.3" +.TH MDBX_LOAD 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/src/man1/mdbx_stat.1 b/src/man1/mdbx_stat.1 index 31302e03..72c15088 100644 --- a/src/man1/mdbx_stat.1 +++ b/src/man1/mdbx_stat.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2023-01-07" "MDBX 0.12.3" +.TH MDBX_STAT 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS