From 2a5cbe64452af6d140239e47eb04042010e02b38 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 10 Feb 2019 16:23:49 +0300 Subject: [PATCH 01/19] mdbx: fix comment typo (minor). --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 7d536eca..80e3ac76 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5127,7 +5127,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper]. * - * Copy-with-compaction by previous version of libmfbx could produce DB-file + * Copy-with-compaction by previous version of libmdbx could produce DB-file * less than meta.geo.lower bound, in case actual filling is low or no data * at all. This is not a problem as there is no damage or loss of data. * Therefore it is better not to consider such situation as an error, but From 251f189428e5299ccf02a3a797679ef1d4cf2266 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 13 Feb 2019 20:23:24 +0300 Subject: [PATCH 02/19] mdbx: fix one more comment typo (minor). --- mdbx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdbx.h b/mdbx.h index 364b6c76..4afd81e9 100644 --- a/mdbx.h +++ b/mdbx.h @@ -610,7 +610,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * - MDBX_NOTLS * Don't use Thread-Local Storage. Tie reader locktable slots to * MDBX_txn objects instead of to threads. I.e. mdbx_txn_reset() keeps - * the slot reseved for the MDBX_txn object. A thread may use parallel + * the slot reserved for the MDBX_txn object. A thread may use parallel * read-only transactions. A read-only transaction may span threads if * the user synchronizes its use. Applications that multiplex many * user threads over individual OS threads need this option. 
Such an From cbf96368b9428940eabc6ae69c1f6721dbba032a Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Jun 2019 01:29:45 +0300 Subject: [PATCH 03/19] mdbx: backport - fix GC corruption due to deep recursive rebalance from update_gc(). Change-Id: I810250deb25cd625e737000282b434e3158ef8cc --- src/mdbx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index 80e3ac76..64a54fc0 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4320,7 +4320,9 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); + mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); + mc.mc_flags -= C_RECLAIMING; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; From e095282437bbf060cfe37a3f7a60d8ff069c2b9c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 2 Feb 2019 17:13:41 +0300 Subject: [PATCH 04/19] mdbx: backport - fix __ANDROID__ typo. Thanks to Howard Chu. Change-Id: Ibcbe2e4790a5df5758d9fd6c621793ea42a94682 --- src/lck-posix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 841d5cb9..e8d4539a 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -20,8 +20,8 @@ #ifndef MDBX_USE_ROBUST /* Howard Chu: Android currently lacks Robust Mutex support */ #if defined(EOWNERDEAD) && \ - !defined(ANDROID) /* LY: glibc before 2.10 has a troubles with Robust \ - Mutex too. */ \ + !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \ + with Robust Mutex too. */ \ && __GLIBC_PREREQ(2, 10) #define MDBX_USE_ROBUST 1 #else From aa7a55b480a3e67baf63767cf96f64e3a8397c72 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 4 Feb 2019 00:08:56 +0300 Subject: [PATCH 05/19] mdbx: backport - fix handling MDBX_APPENDDUP mode.
Change-Id: I36de2a8dcab5126dab3857a7840ab3904a1d19c8 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 64a54fc0..0835aa5a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8302,7 +8302,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (rc > 0) { rc = MDBX_NOTFOUND; mc->mc_ki[mc->mc_top]++; - } else { + } else if (unlikely(rc < 0 || (flags & MDBX_APPENDDUP) == 0)) { /* new key is <= last key */ rc = MDBX_EKEYMISMATCH; } From efcb417838b7d00e73da865d5b18ee2086d88aed Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 2 May 2019 16:46:05 +0300 Subject: [PATCH 06/19] mdbx: backport - fix TAGRET typo (minor). Change-Id: Iffafbed7fdad3492aeb51f17caf8109a5b3e35c0 --- src/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.c b/src/version.c index 0fae43fb..413edb4e 100644 --- a/src/version.c +++ b/src/version.c @@ -30,5 +30,5 @@ "@MDBX_GIT_DESCRIBE@"}}; /*LIBMDBX_EXPORTS*/ const mdbx_build_info mdbx_build = { - "@MDBX_BUILD_TIMESTAMP@", "@MDBX_BUILD_TAGRET@", "@MDBX_BUILD_OPTIONS@", + "@MDBX_BUILD_TIMESTAMP@", "@MDBX_BUILD_TARGET@", "@MDBX_BUILD_OPTIONS@", "@MDBX_BUILD_COMPILER@", "@MDBX_BUILD_FLAGS@"}; From 179185985e42c90ace97044ef8830700e6f89f11 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 30 Jan 2019 23:43:34 +0000 Subject: [PATCH 07/19] mdbx: import - tweak mdb_page_split (ITS#8969). Bump up number of keys for which we use fine-grained splitpoint search Change-Id: Icca2e1953cbcd6898b790f657636c2195b397790 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 0835aa5a..9036d7c0 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10884,7 +10884,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, * This yields better packing during sequential inserts. 
*/ int dir; - if (nkeys < 20 || nsize > pmax / 16 || newindx >= nkeys) { + if (nkeys < 32 || nsize > pmax / 16 || newindx >= nkeys) { /* Find split point */ psize = 0; if (newindx <= split_indx || newindx >= nkeys) { From c2f9d088d5363c234ddcb7b931460c23b0ce18ab Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Jun 2019 18:46:41 +0300 Subject: [PATCH 08/19] mdbx: rework RECLAIMING inside update_gc(). Change-Id: I9cf592476780bfdb346472baa12497d68a3d5aad --- src/mdbx.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 9036d7c0..6c5f1560 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3755,6 +3755,7 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { const int extra = mdbx_backlog_extragap(txn->mt_env); if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { + mc->mc_flags &= ~C_RECLAIMING; int rc = mdbx_cursor_touch(mc); if (unlikely(rc)) return rc; @@ -3768,6 +3769,7 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { break; } } + mc->mc_flags |= C_RECLAIMING; } return MDBX_SUCCESS; @@ -3880,6 +3882,7 @@ static int mdbx_update_gc(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout_notracking; + mc.mc_flags |= C_RECLAIMING; mc.mc_next = txn->mt_cursors[FREE_DBI]; txn->mt_cursors[FREE_DBI] = &mc; @@ -3925,9 +3928,7 @@ retry: mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, cleaned_gc_slot, cleaned_gc_id); - mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_del(&mc, 0); - mc.mc_flags ^= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)); @@ -3949,9 +3950,7 @@ retry: mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, cleaned_gc_id); - mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_del(&mc, 0); - mc.mc_flags ^= C_RECLAIMING; if (unlikely(rc != 
MDBX_SUCCESS)) goto bailout; settled = 0; @@ -4104,7 +4103,9 @@ retry: if (befree_stored < MDBX_PNL_SIZE(txn->mt_befree_pages)) { if (unlikely(!befree_stored)) { /* Make sure last page of freeDB is touched and on befree-list */ + mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); + mc.mc_flags |= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND)) goto bailout; } @@ -4198,7 +4199,9 @@ retry: reused_gc_slot) * env->me_maxgc_ov1page) { /* LY: need just a txn-id for save page list. */ + mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); + mc.mc_flags |= C_RECLAIMING; if (likely(rc == MDBX_SUCCESS)) { /* LY: ok, reclaimed from freedb. */ mdbx_trace("%s: took @%" PRIaTXN " from GC, continue", @@ -4320,9 +4323,7 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); - mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - mc.mc_flags -= C_RECLAIMING; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -4414,7 +4415,7 @@ retry: key.iov_len = sizeof(fill_gc_id); mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); - mc.mc_flags |= C_RECLAIMING | C_GCFREEZE; + mc.mc_flags |= C_GCFREEZE; unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; if (unlikely(chunk > left)) { mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, @@ -4422,12 +4423,12 @@ retry: if (loop < 5 || chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); if (loop < 21) - mc.mc_flags -= C_GCFREEZE; + mc.mc_flags &= ~C_GCFREEZE; } chunk = left; } rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT | MDBX_RESERVE); - mc.mc_flags &= ~(C_RECLAIMING | C_GCFREEZE); + mc.mc_flags &= ~C_GCFREEZE; if 
(unlikely(rc != MDBX_SUCCESS)) goto bailout; From 0eeb5f83c2e5e5060b9534553ab579a9cc46f199 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Jun 2019 22:54:04 +0300 Subject: [PATCH 09/19] mdbx: bump version to v0.2.2 Change-Id: I1b3802ce91e7b5241f3cbcf3ec54aa6394971dff --- src/version.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.c b/src/version.c index 413edb4e..e7eb4c2e 100644 --- a/src/version.c +++ b/src/version.c @@ -18,8 +18,8 @@ #error "API version mismatch!" #endif -#define MDBX_VERSION_RELEASE 0 -#define MDBX_VERSION_REVISION 2 +#define MDBX_VERSION_RELEASE 2 +#define MDBX_VERSION_REVISION 0 /*LIBMDBX_EXPORTS*/ const mdbx_version_info mdbx_version = { MDBX_VERSION_MAJOR, From d138a2a8e1b306a6ec0a2d7767769cf3448e22f2 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Jun 2019 20:41:50 +0300 Subject: [PATCH 10/19] mdbx-test: backport - fix dbsize-options handling. Change-Id: Ia51f802ac1ad4e8b1b059a3f3b38214bda6b43fc --- test/main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/main.cc b/test/main.cc index 41868ccf..63a3c831 100644 --- a/test/main.cc +++ b/test/main.cc @@ -182,11 +182,11 @@ int main(int argc, char *const argv[]) { mdbx_limits_dbsize_min(params.pagesize), mdbx_limits_dbsize_max(params.pagesize))) continue; - if (config::parse_option(argc, argv, narg, "size", params.size_now, + if (config::parse_option(argc, argv, narg, "size-upper", params.size_upper, mdbx_limits_dbsize_min(params.pagesize), mdbx_limits_dbsize_max(params.pagesize))) continue; - if (config::parse_option(argc, argv, narg, "size-upper", params.size_upper, + if (config::parse_option(argc, argv, narg, "size", params.size_now, mdbx_limits_dbsize_min(params.pagesize), mdbx_limits_dbsize_max(params.pagesize))) continue; From 04a91adc70c6aaedefbf9d95f6092a3a7d763281 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 24 Jun 2019 00:56:26 +0300 Subject: [PATCH 11/19] mdbx: backport - fix pwrite() 
for WRITE_MAX. Change-Id: If4924d20c1e267c2d3a190c860b89fc2fda0d517 --- src/osal.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/osal.c b/src/osal.c index 925c43b9..2b43fbe4 100644 --- a/src/osal.c +++ b/src/osal.c @@ -581,17 +581,25 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, return (bytes == written) ? MDBX_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; return GetLastError(); #else - int rc; - intptr_t written; - do { + while (true) { STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); - written = pwrite(fd, buf, bytes, offset); + const intptr_t written = + pwrite(fd, buf, (bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset); if (likely(bytes == (size_t)written)) return MDBX_SUCCESS; - rc = errno; - } while (rc == EINTR); - return (written < 0) ? rc : MDBX_EIO /* Use which error code (ENOSPC)? */; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + } else if (written > 0) { + bytes -= written; + offset += written; + buf = (char *)buf + written; + } else { + return -1; + } + } #endif } From 961f08a5d2522335aa6522904e07ed92a7213a67 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 8 Jul 2019 15:27:26 +0300 Subject: [PATCH 12/19] mdbx: backport - refine backlog preparation inside update_gc(). Change-Id: Ib18842c2922afba794d6ab69337580bcea29bfe6 --- src/mdbx.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 6c5f1560..b3e0c5ec 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3739,7 +3739,7 @@ static __inline int mdbx_backlog_size(MDBX_txn *txn) { ? 
MDBX_PNL_SIZE(txn->mt_env->me_reclaimed_pglist) + txn->mt_loose_count : 0; - return reclaimed_and_loose + txn->mt_end_pgno - txn->mt_next_pgno; + return reclaimed_and_loose; } static __inline int mdbx_backlog_extragap(MDBX_env *env) { @@ -3752,7 +3752,9 @@ static __inline int mdbx_backlog_extragap(MDBX_env *env) { * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. */ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { /* LY: extra page(s) for b-tree rebalancing */ - const int extra = mdbx_backlog_extragap(txn->mt_env); + const int extra = + mdbx_backlog_extragap(txn->mt_env) + + MDBX_PNL_SIZEOF(txn->mt_befree_pages) / txn->mt_env->me_maxkey_limit; if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { mc->mc_flags &= ~C_RECLAIMING; @@ -3760,11 +3762,10 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { if (unlikely(rc)) return rc; - int backlog; - while (unlikely((backlog = mdbx_backlog_size(txn)) < extra)) { + while (unlikely(mdbx_backlog_size(txn) < extra)) { rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); if (unlikely(rc)) { - if (unlikely(rc != MDBX_NOTFOUND)) + if (rc != MDBX_NOTFOUND) return rc; break; } @@ -3775,6 +3776,20 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { return MDBX_SUCCESS; } +static void mdbx_prep_backlog_data(MDBX_txn *txn, MDBX_cursor *mc, + size_t bytes) { + const int wanna = + (int)OVPAGES(txn->mt_env, bytes) + mdbx_backlog_extragap(txn->mt_env); + if (unlikely(wanna > mdbx_backlog_size(txn))) { + mc->mc_flags &= ~C_RECLAIMING; + do { + if (mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC) != MDBX_SUCCESS) + break; + } while (wanna > mdbx_backlog_size(txn)); + mc->mc_flags |= C_RECLAIMING; + } +} + /* Count all the pages in each DB and in the freelist and make sure * it matches the actual number of pages being used. * All named DBs must be open for a correct count. 
*/ @@ -4106,7 +4121,7 @@ retry: mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); mc.mc_flags |= C_RECLAIMING; - if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND)) + if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } /* Write to last page of freeDB */ @@ -4114,6 +4129,7 @@ retry: key.iov_base = &txn->mt_txnid; do { data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages); + mdbx_prep_backlog_data(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -4323,6 +4339,7 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); + mdbx_prep_backlog_data(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (unlikely(rc != MDBX_SUCCESS)) @@ -4420,9 +4437,10 @@ retry: if (unlikely(chunk > left)) { mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, left, fill_gc_id); - if (loop < 5 || chunk - left > env->me_maxgc_ov1page) { + if ((loop < 5 && chunk - left > loop / 2) || + chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); - if (loop < 21) + if (loop < 7) mc.mc_flags &= ~C_GCFREEZE; } chunk = left; From 1a123b539573f8674e1c3444e891202aed579132 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 9 Jul 2019 15:23:19 +0300 Subject: [PATCH 13/19] mdbx: backport - fix DB-shrinking race with copy-asis & readers. 
Change-Id: I893b388d186b6425ab60be4b7cc6bf9b67142def --- src/bits.h | 9 ++++--- src/mdbx.c | 70 ++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/bits.h b/src/bits.h index 89c45470..2e41edb9 100644 --- a/src/bits.h +++ b/src/bits.h @@ -253,11 +253,14 @@ typedef struct MDBX_reader { volatile mdbx_pid_t mr_pid; /* The thread ID of the thread owning this txn. */ volatile mdbx_tid_t mr_tid; + /* The number of pages used in the reader's MVCC snapshot, + * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ + volatile pgno_t mr_snapshot_pages; /* cache line alignment */ - uint8_t pad[MDBX_CACHELINE_SIZE - - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) % - MDBX_CACHELINE_SIZE]; + uint8_t pad[MDBX_CACHELINE_SIZE - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + + sizeof(mdbx_tid_t) + sizeof(pgno_t)) % + MDBX_CACHELINE_SIZE]; } MDBX_reader; /* Information about a single database in the environment. */ diff --git a/src/mdbx.c b/src/mdbx.c index b3e0c5ec..27b5a251 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2163,7 +2163,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { mdbx_tassert(txn, edge <= txn->mt_txnid - 1); MDBX_lockinfo *const lck = env->me_lck; - if (unlikely(env->me_lck == NULL /* exclusive mode */)) + if (unlikely(lck == NULL /* exclusive mode */)) return env->me_oldest_stub = edge; const txnid_t last_oldest = lck->mti_oldest; @@ -2201,6 +2201,32 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { return oldest; } +/* Find largest mvcc-snapshot still referenced. 
*/ +static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { + MDBX_lockinfo *const lck = env->me_lck; + if (likely(lck != NULL /* exclusive mode */)) { + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (lck->mti_readers[i].mr_pid) { + /* mdbx_jitter4testing(true); */ + const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages; + const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid; + mdbx_memory_barrier(); + if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages || + snap_txnid != lck->mti_readers[i].mr_txnid)) + goto retry; + if (largest < snap_pages && + lck->mti_oldest <= /* ignore pending updates */ snap_txnid && + snap_txnid <= env->me_txn0->mt_txnid) + largest = snap_pages; + } + } + } + + return largest; +} + /* Add a page to the txn's dirty list */ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { int (*const adder)(MDBX_DPL, pgno_t pgno, MDBX_page * page) = @@ -3231,6 +3257,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false); if (r) { + r->mr_snapshot_pages = meta->mm_geo.next; r->mr_txnid = snap; mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid == mdbx_getpid()); @@ -3267,6 +3294,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); txn->mt_ro_reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); } else { /* Not yet touching txn == env->me_txn0, it may be active */ mdbx_jitter4testing(false); @@ -3616,13 +3645,20 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); if (F_ISSET(txn->mt_flags, 
MDBX_TXN_RDONLY)) { #if defined(_WIN32) || defined(_WIN64) if (txn->mt_flags & MDBX_SHRINK_ALLOWED) mdbx_srwlock_ReleaseShared(&env->me_remap_guard); #endif if (txn->mt_ro_reader) { + mdbx_ensure(env, /* paranoia is appropriate here */ + txn->mt_txnid == txn->mt_ro_reader->mr_txnid && + txn->mt_ro_reader->mr_txnid >= env->me_lck->mti_oldest); + txn->mt_ro_reader->mr_snapshot_pages = 0; txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; + mdbx_memory_barrier(); env->me_lck->mti_readers_refresh_flag = true; if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) @@ -5311,19 +5347,23 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next > pending->mm_geo.shrink + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; - const pgno_t with_backlog_gap = pending->mm_geo.next + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) - mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + const pgno_t largest = mdbx_find_largest(env, pending->mm_geo.next); + if (pending->mm_geo.now > largest && + pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) { + const pgno_t aligner = + pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; + const pgno_t with_backlog_gap = largest + backlog_gap; + const pgno_t aligned = pgno_align2os_pgno( + env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t bottom = + (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) + mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + } } } @@ -5974,7 +6014,7 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { +static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { uint64_t filesize_before_mmap; MDBX_meta meta; int rc = MDBX_RESULT_FALSE; From 214df7c1e19e86b073b9073c60e697cc99265b48 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 27 Jul 2019 22:13:28 +0300 Subject: [PATCH 14/19] Note about moving from Github. The [repository was moved](https://abf.io/erthink/libmdbx) due to illegal discriminatory blocking of access from the territory of the Russian Crimea and for sovereign crimeans. Crimea is Russia. Change-Id: I5a4eb6b50be2e88f4dc6658d00331954e373603a --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index a2a6ab7a..aa7c6a5a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +## The [repository was moved](https://abf.io/erthink/libmdbx) due to illegal discriminatory blocking of access from the territory of the Russian Crimea and for sovereign crimeans. + +--- + libmdbx ====================================== **Revised and extended descendant of [Symas LMDB](https://symas.com/lmdb/).** From 263dbd97c5ece6a59698945efa419142a563bcf4 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 22 Aug 2019 21:11:49 +0300 Subject: [PATCH 15/19] mdbx: backport - fix env_sync_ex() for out-of-txn case. 
Change-Id: Ie19bbe1d467ce4fc83f8dfc1e367070f532ee335 --- src/mdbx.c | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 27b5a251..b05bffd2 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2160,7 +2160,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { mdbx_tassert(txn, (txn->mt_flags & MDBX_RDONLY) == 0); MDBX_env *env = txn->mt_env; const txnid_t edge = mdbx_reclaiming_detent(env); - mdbx_tassert(txn, edge <= txn->mt_txnid - 1); + mdbx_tassert(txn, edge <= txn->mt_txnid); MDBX_lockinfo *const lck = env->me_lck; if (unlikely(lck == NULL /* exclusive mode */)) @@ -2202,7 +2202,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { } /* Find largest mvcc-snapshot still referenced. */ -static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { +static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck; if (likely(lck != NULL /* exclusive mode */)) { const unsigned snap_nreaders = lck->mti_numreaders; @@ -3009,39 +3009,43 @@ static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { return rc; } - MDBX_meta *head = mdbx_meta_head(env); + const MDBX_meta *head = mdbx_meta_head(env); if (!META_IS_STEADY(head) || env->me_sync_pending) { if (force || (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - if (outside_txn && - env->me_sync_pending > - pgno2bytes(env, 16 /* FIXME: define threshold */) && - (flags & MDBX_NOSYNC) == 0) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); + if (outside_txn) { + if (env->me_sync_pending > + pgno2bytes(env, 16 /* FIXME: define threshold */) && + (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0) { + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + const size_t usedbytes = 
pgno_align2os_bytes(env, head->mm_geo.next); - mdbx_txn_unlock(env); + mdbx_txn_unlock(env); - /* LY: pre-sync without holding lock to reduce latency for writer(s) */ - int rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, - flags & MDBX_MAPASYNC) - : mdbx_filesync(env->me_fd, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* LY: pre-sync without holding lock to reduce latency for writer(s) */ + int rc = (flags & MDBX_WRITEMAP) + ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) + : mdbx_filesync(env->me_fd, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - rc = mdbx_txn_lock(env, nonblock); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + rc = mdbx_txn_lock(env, nonblock); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* LY: head may be changed. */ - head = mdbx_meta_head(env); + /* LY: head may be changed. */ + head = mdbx_meta_head(env); + } + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); } - if (!META_IS_STEADY(head) || env->me_sync_pending) { + if (!META_IS_STEADY(head) || + ((flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0 && + env->me_sync_pending)) { mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIuPTR, container_of(head, MDBX_page, mp_data)->mp_pgno, mdbx_durable_str(head), env->me_sync_pending); From 0eff1930b5730a85b88b1912166bdf9f424b8ae7 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 22 Aug 2019 22:33:18 +0300 Subject: [PATCH 16/19] mdbx: backport - don't shrink DB below the largest reader inside mdbx_env_set_geometry().
Change-Id: I42a5d3a08313fb9590a6730bc0dc06c7b4f16634 --- src/mdbx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index b05bffd2..cf147f0b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5749,6 +5749,10 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, need_unlock = true; } MDBX_meta *head = mdbx_meta_head(env); + if (!inside_txn) { + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); + } if (pagesize < 0) pagesize = env->me_psize; @@ -5768,7 +5772,8 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, if (shrink_threshold < 0) shrink_threshold = pgno2bytes(env, head->mm_geo.shrink); - const size_t usedbytes = pgno2bytes(env, head->mm_geo.next); + const size_t usedbytes = + pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next)); if ((size_t)size_upper < usedbytes) { rc = MDBX_MAP_FULL; goto bailout; From 91ab9e28048a3f1b247c065f3eca9c6bce13b58a Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 23 Aug 2019 17:47:38 +0300 Subject: [PATCH 17/19] mdbx: backport - fix reclaiming_detent() for out-of-txn case. Change-Id: I5275f4f3676b125e860f6a7c204a5f9cdc65dd5f --- src/mdbx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index cf147f0b..75b67c45 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2139,7 +2139,9 @@ static __hot MDBX_meta *mdbx_meta_head(const MDBX_env *env) { static __hot txnid_t mdbx_reclaiming_detent(const MDBX_env *env) { if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC)) - return env->me_txn->mt_txnid - 1; + return likely(env->me_txn0->mt_owner == mdbx_thread_self()) + ? 
env->me_txn0->mt_txnid - 1 + : mdbx_meta_txnid_fluid(env, mdbx_meta_head(env)); return mdbx_meta_txnid_stable(env, mdbx_meta_steady(env)); } From 53b60cdecca3173cbf41f40a486a998387c56fcf Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 27 Aug 2019 14:50:19 +0300 Subject: [PATCH 18/19] mdbx-load: backport - fix backslash escaping (for compatibility with ITS#9068). In fact, MDBX is not affected by this bug, because mdbx_dump was fixed a very long time ago so that it does not produce the problematic backslash sequence. This change is for compatibility with LMDB after http://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=commit;h=5c012bbe033f9bbb273078b07dded59f080d348d Change-Id: I8ff8e003ae29504605402b937becd4fb37120408 --- src/tools/mdbx_load.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 4be1ffe1..26436849 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -262,10 +262,9 @@ static int readline(MDBX_val *out, MDBX_val *buf) { if (mode & PRINT) { while (c2 < end) { - if (*c2 == '\\') { + if (unlikely(*c2 == '\\')) { if (c2[1] == '\\') { - c1++; - c2 += 2; + *c1++ = '\\'; } else { if (c2 + 3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) { Eof = 1; @@ -273,8 +272,8 @@ static int readline(MDBX_val *out, MDBX_val *buf) { return EOF; } *c1++ = (char)unhex(++c2); - c2 += 2; } + c2 += 2; } else { /* copies are redundant when no escapes were used */ *c1++ = *c2++; From 67a8f581b6b2940987a40fd66ab0539b9e8c6dac Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 18 Nov 2019 16:49:02 +0300 Subject: [PATCH 19/19] mdbx: backport - drop obsolete assertion & refine related code.
Fixes https://github.com/leo-yuriev/libmdbx/issues/69 Change-Id: I8f87a5cccc754405c338dd1357065dd066a3e3ce --- src/mdbx.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 75b67c45..23429ac4 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5339,7 +5339,6 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0); mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); const size_t usedbytes = pgno_align2os_bytes(env, pending->mm_geo.next); @@ -5961,9 +5960,12 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, goto bailout; head = /* base address could be changed */ mdbx_meta_head(env); } - env->me_sync_pending += env->me_psize; - mdbx_meta_set_txnid(env, &meta, mdbx_meta_txnid_stable(env, head) + 1); - rc = mdbx_sync_locked(env, env->me_flags, &meta); + if (inside_txn) + env->me_txn->mt_flags |= MDBX_TXN_DIRTY; + else { + mdbx_meta_set_txnid(env, &meta, mdbx_meta_txnid_stable(env, head) + 1); + rc = mdbx_sync_locked(env, env->me_flags, &meta); + } } } else if (pagesize != (intptr_t)env->me_psize) { mdbx_setup_pagesize(env, pagesize); @@ -6322,7 +6324,6 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { mdbx_ensure(env, mdbx_meta_eq(env, &meta, head)); mdbx_meta_set_txnid(env, &meta, txnid + 1); - env->me_sync_pending += env->me_psize; err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); if (err) { mdbx_info("error %d, while updating meta.geo: " @@ -13150,12 +13151,7 @@ int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { txn->mt_canary.z = canary->z; } txn->mt_canary.v = txn->mt_txnid; - - if ((txn->mt_flags & MDBX_TXN_DIRTY) == 0) { - MDBX_env *env = txn->mt_env; - txn->mt_flags |= 
MDBX_TXN_DIRTY; - env->me_sync_pending += env->me_psize; - } + txn->mt_flags |= MDBX_TXN_DIRTY; return MDBX_SUCCESS; }