From ffafd5be101aeb70b17bc4dcfcfe868cb8fb01c7 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 21 May 2018 16:31:36 +0300 Subject: [PATCH 01/83] mdbx: disable warning #5045 for MSVC (minor). --- src/bits.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bits.h b/src/bits.h index 12f2e0d0..ea8e2731 100644 --- a/src/bits.h +++ b/src/bits.h @@ -45,6 +45,9 @@ #if _MSC_VER > 1800 # pragma warning(disable : 4464) /* relative include path contains '..' */ #endif +#if _MSC_VER > 1913 +# pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... */ +#endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic inline expansion */ #pragma warning(disable : 4201) /* nonstandard extension used : nameless struct / union */ From db50fb8726482806c3c25313babd68a18a1133c5 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 21 Jun 2018 17:26:29 +0300 Subject: [PATCH 02/83] mdbx: backport - fix Coverity warning (minor, paranoia). 
Change-Id: I232377a03244dc33beb4f332c0024b454027f659 --- src/mdbx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index c6c8a41b..480b9a38 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1134,7 +1134,7 @@ const char *__cold mdbx_strerror(int errnum) { const char *msg = __mdbx_strerr(errnum); if (!msg) { #ifdef _MSC_VER - static __thread char buffer[1024]; + static char buffer[1024]; size_t size = FormatMessageA( FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buffer, @@ -1142,6 +1142,13 @@ const char *__cold mdbx_strerror(int errnum) { if (size) msg = buffer; #else + if (errnum < 0) { + static char buffer[32]; + int rc = snprintf(buffer, sizeof(buffer) - 1, "unknown error %d", errnum); + assert(rc > 0); + (void)rc; + return buffer; + } msg = strerror(errnum); #endif } From de43ab0d21a55c4aa65eafeae6e7da654565c968 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 21 Jun 2018 19:43:43 +0300 Subject: [PATCH 03/83] mdbx-ci: migrate to Circle-CI 2.0 Change-Id: Id86af9e033d64a4dc2043db33cd8e7ae173feb22 --- .circleci/config.yml | 20 ++++++++++++++++++++ circle.yml | 14 -------------- 2 files changed, 20 insertions(+), 14 deletions(-) create mode 100644 .circleci/config.yml delete mode 100644 circle.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..91e11a4b --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,20 @@ +version: 2 +jobs: + build: + docker: + - image: circleci/buildpack-deps:artful + environment: + - TESTDB: /tmp/test.db + - TESTLOG: /tmp/test.log + steps: + - checkout + - run: make all + - run: ulimit -c unlimited && make check + - run: + command: | + mkdir -p /tmp/artifacts + mv -t /tmp/artifacts $TESTLOG $TESTDB core.* + when: on_fail + - store_artifacts: + path: /tmp/artifacts + destination: test-artifacts diff --git a/circle.yml b/circle.yml deleted file mode 100644 index 77da30e9..00000000 
--- a/circle.yml +++ /dev/null @@ -1,14 +0,0 @@ -machine: - timezone: - Europe/Moscow - -database: - override: - -compile: - override: - - make all - -test: - override: - - make check || mv test.log ${CIRCLE_ARTIFACTS}/ From c579b974a26338563217174cde1fa99e4ff5b52d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 30 Jun 2018 00:02:25 +0300 Subject: [PATCH 04/83] mdbx: backport - avoid weak meta inside mdbx_init_metas(). Change-Id: Ib9c5ab04ad8cff3ad43d94a288cecec45d7ef37d --- src/mdbx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 480b9a38..cce50b4c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4600,11 +4600,10 @@ static MDBX_page *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { MDBX_page *page1 = mdbx_meta_model(env, page0, 0); MDBX_page *page2 = mdbx_meta_model(env, page1, 1); mdbx_meta_model(env, page2, 2); - page2->mp_meta.mm_datasync_sign = MDBX_DATASIGN_WEAK; mdbx_assert(env, !mdbx_meta_eq(env, &page0->mp_meta, &page1->mp_meta)); mdbx_assert(env, !mdbx_meta_eq(env, &page1->mp_meta, &page2->mp_meta)); mdbx_assert(env, !mdbx_meta_eq(env, &page2->mp_meta, &page0->mp_meta)); - return page1; + return page2; } static int mdbx_sync_locked(MDBX_env *env, unsigned flags, From d1809e6e2d4a6cadbf0f9270bebbc51129d9989b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 1 Jul 2018 17:42:35 +0300 Subject: [PATCH 05/83] mdbx: backport - minor fix to avoid Valgrind false-positive issue. 
Change-Id: Ifa4dc51b500ff42a88182d750e22572aa5b2155b --- src/mdbx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mdbx.c b/src/mdbx.c index cce50b4c..ddf52bd4 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7721,6 +7721,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, offset = env->me_psize - (unsigned)olddata.iov_len; flags |= F_DUPDATA | F_SUBDATA; dummy.md_root = mp->mp_pgno; + dummy.md_seq = dummy.md_merkle = 0; sub_root = mp; } if (mp != fp) { From e57e52160937252005959e4d3127e3482b47ba34 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 24 Jul 2018 00:17:54 +0300 Subject: [PATCH 06/83] mdbx: backport - fix nasty suspend_and_append() bug. Change-Id: I043adcff2e6c040426a51b5d4b15bac849e6dd9f --- src/lck-windows.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 831200d3..3f4d2c80 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -190,7 +190,9 @@ static int suspend_and_append(mdbx_handle_array_t **array, (limit * 2 - ARRAY_LENGTH((*array)->handles))); if (!ptr) return MDBX_ENOMEM; - (*array) = (mdbx_handle_array_t *)ptr; + if (limit == ARRAY_LENGTH((*array)->handles)) + memcpy(ptr, *array, sizeof(mdbx_handle_array_t)); + *array = (mdbx_handle_array_t *)ptr; (*array)->limit = limit * 2; } From e442395cbd6ab6785ecb961154db2946eecf4ee8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 31 Jul 2018 11:43:25 +0300 Subject: [PATCH 07/83] mdbx: bump version to v0.1.6 Change-Id: I95d45a815008e2cc9a8785a8c762310a1e907e21 --- CMakeLists.txt | 4 ++-- appveyor.yml | 2 +- src/version.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b6640755..6f53b08d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,8 +13,8 @@ message(WARNING " set(MDBX_VERSION_MAJOR 0) set(MDBX_VERSION_MINOR 1) -set(MDBX_VERSION_RELEASE 3) -set(MDBX_VERSION_REVISION 1) +set(MDBX_VERSION_RELEASE 6) +set(MDBX_VERSION_REVISION 0) 
set(MDBX_VERSION_STRING ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE}) diff --git a/appveyor.yml b/appveyor.yml index ce817b23..98ac34bd 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.1.5.{build} +version: 0.1.6.{build} environment: matrix: diff --git a/src/version.c b/src/version.c index aeb15bed..d5cd3697 100644 --- a/src/version.c +++ b/src/version.c @@ -18,7 +18,7 @@ #error "API version mismatch!" #endif -#define MDBX_VERSION_RELEASE 5 +#define MDBX_VERSION_RELEASE 6 #define MDBX_VERSION_REVISION 1 /*LIBMDBX_EXPORTS*/ const mdbx_version_info mdbx_version = { From 9a1ef8acfb10c1e01947d81d0d9372342e4a0032 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 31 Jul 2018 11:47:10 +0300 Subject: [PATCH 08/83] mdbx-cmake: remove warning-message. Change-Id: Icf9e4f7a96916cf9ab04613344867217be04827a --- CMakeLists.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f53b08d..60b50845 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,15 +2,6 @@ cmake_minimum_required(VERSION 2.8.7) set(TARGET mdbx) project(${TARGET}) -message(WARNING " -*************************************************************** - MDBX is under active development, database format and API - aren't stable at least until 2018Q3. New version won't be - backwards compatible. Main focus of the rework is to provide - clear and robust API and new features. -*************************************************************** -") - set(MDBX_VERSION_MAJOR 0) set(MDBX_VERSION_MINOR 1) set(MDBX_VERSION_RELEASE 6) From c9790b28d0b95c0f686c12b00217195de54d998c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 31 Jul 2018 11:54:26 +0300 Subject: [PATCH 09/83] mdbx-cmake: fix so-version. 
Change-Id: I427d2f27f9092d65a0ffd11353ca466070e98618 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60b50845..5e424453 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,14 +102,14 @@ add_library(${TARGET}_SHARED SHARED set_target_properties(${TARGET}_SHARED PROPERTIES VERSION ${MDBX_VERSION_STRING} - SOVERSION ${MDBX_VERSION_MAJOR} + SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR} OUTPUT_NAME ${TARGET} CLEAN_DIRECT_OUTPUT 1 ) set_target_properties(${TARGET}_STATIC PROPERTIES VERSION ${MDBX_VERSION_STRING} - SOVERSION ${MDBX_VERSION_MAJOR} + SOVERSION ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR} OUTPUT_NAME ${TARGET} CLEAN_DIRECT_OUTPUT 1 ) From eb994802530cadf816e02440681cebd6ad74f9d4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 1 Aug 2018 18:16:40 +0300 Subject: [PATCH 10/83] mdbx: backport - drop unused mdbx_lck_upgrade(). --- src/lck-posix.c | 2 -- src/lck-windows.c | 41 ----------------------------------------- src/osal.h | 1 - 3 files changed, 44 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 532505e8..c8d5e657 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -89,8 +89,6 @@ int mdbx_lck_downgrade(MDBX_env *env, bool complete) { return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; } -int mdbx_lck_upgrade(MDBX_env *env) { return mdbx_lck_exclusive(env->me_lfd); } - int mdbx_rpid_set(MDBX_env *env) { return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); } diff --git a/src/lck-windows.c b/src/lck-windows.c index 3f4d2c80..1e9e0cc9 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -441,47 +441,6 @@ int mdbx_lck_downgrade(MDBX_env *env, bool complete) { return MDBX_SUCCESS /* 7) now at S-? 
(used), done */; } -int mdbx_lck_upgrade(MDBX_env *env) { - /* Transite from locked state (S-E) to exclusive-write (E-E) */ - assert(env->me_fd != INVALID_HANDLE_VALUE); - assert(env->me_lfd != INVALID_HANDLE_VALUE); - - /* 1) must be at S-E (locked), transite to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "S-E(locked) >> ?-E(middle)", GetLastError()); - - /* 3) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); - if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) - return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive-write), done */ - - /* 5) still on ?-E (middle) */ - int rc = GetLastError(); - mdbx_jitter4testing(false); - if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { - /* 6) something went wrong, report but continue */ - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "?-E(middle) >> E-E(exclusive-write)", rc); - } - - /* 7) still on ?-E (middle), try restore S-E (locked) */ - mdbx_jitter4testing(false); - rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) - ? 
MDBX_RESULT_FALSE - : GetLastError(); - - mdbx_jitter4testing(false); - if (rc != MDBX_RESULT_FALSE) { - mdbx_fatal("%s(%s) failed: errcode %u", mdbx_func_, - "?-E(middle) >> S-E(locked)", rc); - return rc; - } - - /* 8) now on S-E (locked) */ - return MDBX_RESULT_FALSE; -} - void mdbx_lck_destroy(MDBX_env *env) { int rc; diff --git a/src/osal.h b/src/osal.h index 46bf82a0..0b230a4a 100644 --- a/src/osal.h +++ b/src/osal.h @@ -548,7 +548,6 @@ int mdbx_lck_init(MDBX_env *env); int mdbx_lck_seize(MDBX_env *env); int mdbx_lck_downgrade(MDBX_env *env, bool complete); -int mdbx_lck_upgrade(MDBX_env *env); void mdbx_lck_destroy(MDBX_env *env); int mdbx_rdt_lock(MDBX_env *env); From d4bfc17818aba08557fe09eba8d0caba5300a7aa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 1 Aug 2018 18:25:41 +0300 Subject: [PATCH 11/83] mdbx: backport - add fallback2shared for mdbx_lck_exclusive(). --- src/lck-posix.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index c8d5e657..e2353575 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -68,11 +68,19 @@ static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset, } } -static __inline int mdbx_lck_exclusive(int lfd) { +static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) { assert(lfd != INVALID_HANDLE_VALUE); if (flock(lfd, LOCK_EX | LOCK_NB)) return errno; - return mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); + int rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); + if (rc != 0 && fallback2shared) { + while (flock(lfd, LOCK_SH)) { + int rc = errno; + if (rc != EINTR) + return rc; + } + } + return rc; } static __inline int mdbx_lck_shared(int lfd) { @@ -157,7 +165,7 @@ bailout: void mdbx_lck_destroy(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ - if (env->me_lck && mdbx_lck_exclusive(env->me_lfd) == 0) { + if (env->me_lck && mdbx_lck_exclusive(env->me_lfd, false) == 0) { mdbx_info("%s: got 
exclusive, drown mutexes", mdbx_func_); int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); if (rc == 0) @@ -225,7 +233,7 @@ static int internal_seize_lck(int lfd) { assert(lfd != INVALID_HANDLE_VALUE); /* try exclusive access */ - int rc = mdbx_lck_exclusive(lfd); + int rc = mdbx_lck_exclusive(lfd, false); if (rc == 0) /* got exclusive */ return MDBX_RESULT_TRUE; @@ -234,7 +242,7 @@ static int internal_seize_lck(int lfd) { rc = mdbx_lck_shared(lfd); if (rc == 0) { /* got shared, try exclusive again */ - rc = mdbx_lck_exclusive(lfd); + rc = mdbx_lck_exclusive(lfd, true); if (rc == 0) /* now got exclusive */ return MDBX_RESULT_TRUE; From 38067c456606a78a3bcb6ea8e71196eee8cab1ad Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 2 Aug 2018 11:14:46 +0300 Subject: [PATCH 12/83] mdbx: backport - fix 'db_dummy' inside mdbx_dbi_open_ex(). Change-Id: I70a21c9b77a43c5af749da5723fa965487a056b0 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index ddf52bd4..230893c3 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10864,10 +10864,10 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, } unsigned dbflag = DB_FRESH | DB_VALID | DB_USRVALID; + MDBX_db db_dummy; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ assert(rc == MDBX_NOTFOUND); - MDBX_db db_dummy; memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_flags = user_flags & PERSISTENT_FLAGS; From d2fcbf5f82020f5499b433b9abd8f436071bdbed Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 10 Aug 2018 09:36:03 +0300 Subject: [PATCH 13/83] mdbx: backport - fix assert-condition inside mdbx_pnl_xappend(). 
Change-Id: Id5ac89c85b7e673c44d60a626c805fe666d221bc --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 230893c3..77799f13 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -536,7 +536,7 @@ static void mdbx_txl_free(MDBX_TXL list) { /* Append ID to PNL. The PNL must be big enough. */ static __inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t id) { - assert(pl[0] + (size_t)1 < MDBX_PNL_ALLOCLEN(pl)); + assert(pl[0] + (size_t)1 <= MDBX_PNL_ALLOCLEN(pl)); pl[pl[0] += 1] = id; } From 3979ba4784a394c553562af696b7667b6a3c1f89 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 10 Aug 2018 18:33:59 +0300 Subject: [PATCH 14/83] mdbx: backport - fix assertions. Change-Id: I95c43ef1ea2da55a124dc43f03890cf1d96f2e61 --- src/mdbx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 77799f13..9a70a641 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7828,8 +7828,6 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { mdbx_cassert(mc, NUMKEYS(mc->mc_pg[mc->mc_top]) == 1); - mdbx_cassert(mc, mc->mc_pg[mc->mc_top]->mp_upper == - mc->mc_pg[mc->mc_top]->mp_lower); mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) && !IS_LEAF2(mc->mc_pg[mc->mc_top])); mdbx_cassert(mc, NODEDSZ(leaf) == 0); @@ -7837,7 +7835,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, key->iov_len < UINT16_MAX); leaf->mn_ksize = (uint16_t)key->iov_len; memcpy(NODEKEY(leaf), key->iov_base, key->iov_len); - assert((char *)NODEDATA(leaf) + NODEDSZ(leaf) < + assert((char *)NODEKEY(leaf) + NODEDSZ(leaf) < (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; } From bff6aa460ac4baa9d7d2bae8fe931e986f845c01 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 10 Aug 2018 18:39:19 +0300 Subject: [PATCH 15/83] mdbx: backport - fix MDBX_EKEYMISMATCH while update multi-value with MDBX_CURRENT. 
Change-Id: I3095620a94f694fb2c29b9c4faab9ea02b9bd7b7 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 9a70a641..0bb416b8 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7460,7 +7460,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, DVAL((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); int dupdata_flag = 0; - if (flags & MDBX_CURRENT) { + if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) { /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи, * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает * со значением в текущей позиции курсора. From e054ad2ebbfa1b7ea2f99c8011fda736578d0ee6 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 11 Aug 2018 21:44:36 +0300 Subject: [PATCH 16/83] mdbx-test: backport - add 'strikethrough' for bitmask-options. Change-Id: I86dd2f8cdbd5a32a0471a5eee1e2b3a5857541ac --- test/config.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/config.cc b/test/config.cc index cbff68ce..04968515 100644 --- a/test/config.cc +++ b/test/config.cc @@ -75,7 +75,7 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, if (!parse_option(argc, argv, narg, option, &list)) return false; - mask = 0; + unsigned clear = 0; while (*list) { if (*list == ',' || *list == ' ' || *list == '\t') { ++list; @@ -83,14 +83,21 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, } const char *const comma = strchr(list, ','); + const bool strikethrough = *list == '-' || *list == '~'; + if (strikethrough || *list == '+') + ++list; + else + mask = clear; const size_t len = (comma) ? 
comma - list : strlen(list); const option_verb *scan = verbs; + while (true) { if (!scan->verb) failure("Unknown verb '%.*s', for option '==%s'\n", (int)len, list, option); if (strlen(scan->verb) == len && strncmp(list, scan->verb, len) == 0) { - mask |= scan->mask; + mask = strikethrough ? mask & ~scan->mask : mask | scan->mask; + clear = strikethrough ? clear & ~scan->mask : clear | scan->mask; list += len; break; } From e18551061ec3f4dd2f9bdeb5f031bf810432b550 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 11 Aug 2018 22:06:26 +0300 Subject: [PATCH 17/83] mdbx-test: backport - fix keylen/datalen min/max ranges checking. Change-Id: Iee5d2f71ad22ec6e86167f5181deff54f0b5b518 --- test/keygen.cc | 8 ++++---- test/main.cc | 27 ++++++++++++++++++++------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/test/keygen.cc b/test/keygen.cc index 99b46f29..1b18fa00 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -122,16 +122,16 @@ void maker::setup(const config::actor_params_pod &actor, unsigned thread_number) { key_essentials.flags = actor.table_flags & (MDBX_INTEGERKEY | MDBX_REVERSEKEY); - assert(actor.keylen_min < UINT8_MAX); + assert(actor.keylen_min <= UINT8_MAX); key_essentials.minlen = (uint8_t)actor.keylen_min; - assert(actor.keylen_max < UINT16_MAX); + assert(actor.keylen_max <= UINT16_MAX); key_essentials.maxlen = (uint16_t)actor.keylen_max; value_essentials.flags = actor.table_flags & (MDBX_INTEGERDUP | MDBX_REVERSEDUP); - assert(actor.datalen_min < UINT8_MAX); + assert(actor.datalen_min <= UINT8_MAX); value_essentials.minlen = (uint8_t)actor.datalen_min; - assert(actor.datalen_max < UINT16_MAX); + assert(actor.datalen_max <= UINT16_MAX); value_essentials.maxlen = (uint16_t)actor.datalen_max; assert(thread_number < 2); diff --git a/test/main.cc b/test/main.cc index bc3198ed..e6e0e177 100644 --- a/test/main.cc +++ b/test/main.cc @@ -188,20 +188,33 @@ int main(int argc, char *const argv[]) { config::duration, 1)) continue; if 
(config::parse_option(argc, argv, narg, "keylen.min", params.keylen_min, - config::no_scale, 0, params.keylen_max)) + config::no_scale, 0, UINT8_MAX)) { + if (params.keylen_max < params.keylen_min) + params.keylen_max = params.keylen_min; continue; - if (config::parse_option(argc, argv, narg, "keylen.max", params.keylen_max, - config::no_scale, params.keylen_min, - mdbx_get_maxkeysize(0))) + } + if (config::parse_option( + argc, argv, narg, "keylen.max", params.keylen_max, config::no_scale, + 0, std::min(mdbx_get_maxkeysize(0), (int)UINT16_MAX))) { + + if (params.keylen_min > params.keylen_max) + params.keylen_min = params.keylen_max; continue; + } if (config::parse_option(argc, argv, narg, "datalen.min", params.datalen_min, config::no_scale, 0, - params.datalen_max)) + UINT8_MAX)) { + if (params.datalen_max < params.datalen_min) + params.datalen_max = params.datalen_min; continue; + } if (config::parse_option(argc, argv, narg, "datalen.max", - params.datalen_max, config::no_scale, - params.datalen_min, MDBX_MAXDATASIZE)) + params.datalen_max, config::no_scale, 0, + std::min((int)UINT16_MAX, MDBX_MAXDATASIZE))) { + if (params.datalen_min > params.datalen_max) + params.datalen_min = params.datalen_max; continue; + } if (config::parse_option(argc, argv, narg, "batch.read", params.batch_read, config::no_scale, 1)) continue; From 59026d5f84dfa7ced1545a3751c28d45324ad3a3 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 12 Aug 2018 21:15:23 +0300 Subject: [PATCH 18/83] mdbx-test: backport - fix minor typos. 
Change-Id: I4889a0e698bdfdda7eed257a5cd29e8b8089d102 --- src/mdbx.c | 2 +- test/hill.cc | 2 -- test/utils.cc | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 0bb416b8..514b8a71 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7681,7 +7681,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, offset *= 4; /* space for 4 more */ break; } - /* FALLTHRU: Big enough MDBX_DUPFIXaED sub-page */ + /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ __fallthrough; case MDBX_CURRENT | MDBX_NODUPDATA: case MDBX_CURRENT: diff --git a/test/hill.cc b/test/hill.cc index c9115784..0d609b86 100644 --- a/test/hill.cc +++ b/test/hill.cc @@ -156,8 +156,6 @@ bool testcase_hill::run() { a_serial); generate_pair(a_serial, a_key, a_data_0, 0); generate_pair(a_serial, a_key, a_data_1, age_shift); - if (a_serial == 808) - log_trace("!!!"); int rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, &a_data_0->value, update_flags); if (unlikely(rc != MDBX_SUCCESS)) diff --git a/test/utils.cc b/test/utils.cc index 0855c7ee..53a750e3 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -93,7 +93,7 @@ bool hex2data(const char *hex_begin, const char *hex_end, void *ptr, //----------------------------------------------------------------------------- -/* TODO: replace my 'libmera' fomr t1ha. */ +/* TODO: replace my 'libmera' from t1ha. */ uint64_t entropy_ticks(void) { #if defined(EMSCRIPTEN) return (uint64_t)emscripten_get_now(); From 652bb08f8c8a52b2eba05f87eff952dc68a46c2f Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 13 Aug 2018 01:50:29 +0300 Subject: [PATCH 19/83] mdbx-test: backport - use strtoull() and retry with base=10. 
Change-Id: Ica846ed0a13eb4468a45620518b9ccf85e77a764 --- test/config.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/config.cc b/test/config.cc index 04968515..ae9367e6 100644 --- a/test/config.cc +++ b/test/config.cc @@ -118,7 +118,12 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, char *suffix = nullptr; errno = 0; - unsigned long raw = strtoul(value_cstr, &suffix, 0); + unsigned long long raw = strtoull(value_cstr, &suffix, 0); + if ((suffix && *suffix) || errno) { + suffix = nullptr; + errno = 0; + raw = strtoull(value_cstr, &suffix, 10); + } if (errno) failure("Option '--%s' expects a numeric value (%s)\n", option, test_strerror(errno)); From b91e645919e3a96423250f98994795eb4573e515 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 12 Aug 2018 21:08:07 +0300 Subject: [PATCH 20/83] mdbx-test: add 'gc.sh' script. Change-Id: I633c93c0865b0d2609688713e986edf51ce6547d --- test/gc.sh | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 test/gc.sh diff --git a/test/gc.sh b/test/gc.sh new file mode 100755 index 00000000..81e32ba6 --- /dev/null +++ b/test/gc.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -euo pipefail +TESTDB_PREFIX=${1:-/dev/shm/mdbx-gc-test} + +function rep9 { printf "%*s" $1 '' | tr ' ' '9'; } +function join { local IFS="$1"; shift; echo "$*"; } +function bit2option { local -n arr=$1; (( ($2&(1<<$3)) != 0 )) && echo -n '+' || echo -n '-'; echo "${arr[$3]}"; } + +options=(writemap coalesce lifo) + +function bits2list { + local -n arr=$1 + local i + local list=() + for ((i=0; i<${#arr[@]}; ++i)) do + list[$i]=$(bit2option $1 $2 $i) + done + join , "${list[@]}" +} + +for nops in {1..7}; do + for ((wbatch=nops; wbatch > 0; --wbatch)); do + for ((bits=2**${#options[@]}; --bits >= 0; )); do + echo "=================================== $(date)" + rm -f ${TESTDB_PREFIX}* + echo --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list 
options $bits) + ./mdbx_test --pathname=${TESTDB_PREFIX} --size=8G --keylen.min=1 --keylen.max=250 --datalen.min=1 --datalen.max=500 \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --hill | bzip2 -c > ${TESTDB_PREFIX}.log.bz2 + ./mdbx_chk -nvv ${TESTDB_PREFIX} | tee ${TESTDB_PREFIX}-chk.log + done + done +done + +echo "=== ALL DONE ====================== $(date)" From 57655583e51848b951de332c643b009a33d2c5cc Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 11 Aug 2018 02:04:34 +0300 Subject: [PATCH 21/83] mdbx: backport - fix/rewrite mdbx_update_gc(). Change-Id: I580a1ff0cbeeb529e2bcbd50d97bfba7bcf5a546 --- src/bits.h | 4 +- src/mdbx.c | 392 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 223 insertions(+), 173 deletions(-) diff --git a/src/bits.h b/src/bits.h index 4b0edd01..a217444b 100644 --- a/src/bits.h +++ b/src/bits.h @@ -751,8 +751,8 @@ struct MDBX_env { MDBX_PNL me_free_pgs; /* ID2L of pages written during a write txn. Length MDBX_PNL_UM_SIZE. */ MDBX_ID2L me_dirtylist; - /* Max number of freelist items that can fit in a single overflow page */ - unsigned me_maxfree_1pg; + /* Number of freelist items that can fit in a single overflow page */ + unsigned me_maxgc_ov1page; /* Max size of a node on a page */ unsigned me_nodemax; unsigned me_maxkey_limit; /* max size of a key */ diff --git a/src/mdbx.c b/src/mdbx.c index 514b8a71..4c7cc2e6 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2141,7 +2141,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); if (unlikely(mc->mc_flags & C_RECLAIMING)) { - /* If mc is updating the freeDB, then the freelist cannot play + /* If mc is updating the freeDB, then the befree-list cannot play * catch-up with itself by growing while trying to save it. 
*/ flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); @@ -2380,8 +2380,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (repg_len > MDBX_PNL_UM_SIZE / 2) break; if (flags & MDBX_COALESCE) { - if (repg_len /* current size */ >= env->me_maxfree_1pg / 2 || - repg_pos /* prev size */ >= env->me_maxfree_1pg / 4) + if (repg_len /* current size */ >= env->me_maxgc_ov1page || + repg_pos /* prev size */ >= env->me_maxgc_ov1page / 2) flags &= ~MDBX_COALESCE; } } @@ -3485,80 +3485,95 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { return MDBX_SUCCESS; } -/* Save the freelist as of this transaction to the freeDB. - * This changes the freelist. Keep trying until it stabilizes. */ -static int mdbx_freelist_save(MDBX_txn *txn) { +/* Cleanup reclaimed GC records, than save the befree-list as of this + * transaction to GC (aka freeDB). This recursive changes the reclaimed-list + * loose-list and befree-list. Keep trying until it stabilizes. */ +static int mdbx_update_gc(MDBX_txn *txn) { /* env->me_reclaimed_pglist[] can grow and shrink during this call. - * env->me_last_reclaimed and txn->mt_free_pages[] can only grow. - * Page numbers cannot disappear from txn->mt_free_pages[]. */ - MDBX_cursor mc; + * env->me_last_reclaimed and txn->mt_befree_pages[] can only grow. + * Page numbers cannot disappear from txn->mt_befree_pages[]. 
*/ MDBX_env *env = txn->mt_env; - int rc, more = 1; - txnid_t cleanup_reclaimed_id = 0, head_id = 0; - pgno_t befree_count = 0; - intptr_t head_room = 0, total_room = 0; - unsigned cleanup_reclaimed_pos = 0, refill_reclaimed_pos = 0; const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; - rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); + MDBX_cursor mc; + int rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; - /* MDBX_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ - const intptr_t clean_limit = - (env->me_flags & (MDBX_NOMEMINIT | MDBX_WRITEMAP)) ? SSIZE_MAX - : env->me_maxfree_1pg; + const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; + mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); + (void)dbg_prefix_mode; + unsigned befree_stored = 0, loop = 0; + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); +retry: + mdbx_trace(" >> restart"); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); -again_on_freelist_change: - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + if (unlikely(/* paranoia */ ++loop > 42)) { + mdbx_error("too more loops %u, bailout", loop); + rc = MDBX_PROBLEM; + goto bailout; + } + + unsigned placed = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, + filled_gc_slot = ~0u; + txnid_t cleaned_gc_id = 0, + head_gc_id = lifo ? *env->me_oldest : env->me_last_reclaimed; + while (1) { - /* Come back here after each Put() in case freelist changed */ + /* Come back here after each Put() in case befree-list changed */ MDBX_val key, data; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (!lifo) { /* If using records from freeDB which we have not yet deleted, * now delete them and any we reserved for me_reclaimed_pglist. 
*/ - while (cleanup_reclaimed_id < env->me_last_reclaimed) { + while (cleaned_gc_id < env->me_last_reclaimed) { rc = mdbx_cursor_first(&mc, &key, NULL); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; rc = mdbx_prep_backlog(txn, &mc); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - cleanup_reclaimed_id = head_id = *(txnid_t *)key.iov_base; - total_room = head_room = 0; - more = 1; - mdbx_tassert(txn, cleanup_reclaimed_id <= env->me_last_reclaimed); + cleaned_gc_id = head_gc_id = *(txnid_t *)key.iov_base; + mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); + placed = 0; + mdbx_tassert(txn, cleaned_gc_id <= env->me_last_reclaimed); mc.mc_flags |= C_RECLAIMING; + mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, + cleaned_gc_id); rc = mdbx_cursor_del(&mc, 0); mc.mc_flags ^= C_RECLAIMING; - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } else if (txn->mt_lifo_reclaimed) { /* LY: cleanup reclaimed records. */ - while (cleanup_reclaimed_pos < txn->mt_lifo_reclaimed[0]) { - cleanup_reclaimed_id = txn->mt_lifo_reclaimed[++cleanup_reclaimed_pos]; - key.iov_base = &cleanup_reclaimed_id; - key.iov_len = sizeof(cleanup_reclaimed_id); + while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]) { + cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot]; + head_gc_id = (head_gc_id > cleaned_gc_id) ? 
cleaned_gc_id : head_gc_id; + key.iov_base = &cleaned_gc_id; + key.iov_len = sizeof(cleaned_gc_id); rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET); if (likely(rc != MDBX_NOTFOUND)) { - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; rc = mdbx_prep_backlog(txn, &mc); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mc.mc_flags |= C_RECLAIMING; + mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, + cleaned_gc_slot, cleaned_gc_id); rc = mdbx_cursor_del(&mc, 0); mc.mc_flags ^= C_RECLAIMING; - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } } + // handle loose pages - put ones into the reclaimed- or befree-list mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (txn->mt_loose_pages) { /* Return loose page numbers to me_reclaimed_pglist, @@ -3566,7 +3581,7 @@ again_on_freelist_change: * The pages themselves remain in dirtylist. */ if (unlikely(!env->me_reclaimed_pglist) && !(lifo && env->me_last_reclaimed > 1)) { - /* Put loose page numbers in mt_free_pages, + /* Put loose page numbers in mt_befree_pages, * since unable to return them to me_reclaimed_pglist. 
*/ if (unlikely((rc = mdbx_pnl_need(&txn->mt_befree_pages, txn->mt_loose_count)) != 0)) @@ -3575,8 +3590,9 @@ again_on_freelist_change: mdbx_pnl_xappend(txn->mt_befree_pages, mp->mp_pgno); } else { /* Room for loose pages + temp PNL with same */ - if ((rc = mdbx_pnl_need(&env->me_reclaimed_pglist, - 2 * txn->mt_loose_count + 1)) != 0) + rc = mdbx_pnl_need(&env->me_reclaimed_pglist, + 2 * txn->mt_loose_count + 1); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; MDBX_PNL loose = env->me_reclaimed_pglist + MDBX_PNL_ALLOCLEN(env->me_reclaimed_pglist) - @@ -3612,9 +3628,9 @@ again_on_freelist_change: txn->mt_loose_count = 0; } + // handle reclaimed pages - return suitable into unallocated space mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (env->me_reclaimed_pglist) { - /* Refund suitable pages into "unallocated" space */ pgno_t tail = txn->mt_next_pgno; pgno_t *const begin = env->me_reclaimed_pglist + 1; pgno_t *const end = begin + env->me_reclaimed_pglist[0]; @@ -3644,82 +3660,71 @@ again_on_freelist_change: } } - /* Save the PNL of pages freed by this txn, to a single record */ - if (befree_count < txn->mt_befree_pages[0]) { - if (unlikely(!befree_count)) { - /* Make sure last page of freeDB is touched and on freelist */ + // handle befree-list - store ones into singe gc-record + if (befree_stored < txn->mt_befree_pages[0]) { + if (unlikely(!befree_stored)) { + /* Make sure last page of freeDB is touched and on befree-list */ rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); - if (unlikely(rc && rc != MDBX_NOTFOUND)) + if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND)) goto bailout; } - pgno_t *befree_pages = txn->mt_befree_pages; /* Write to last page of freeDB */ key.iov_len = sizeof(txn->mt_txnid); key.iov_base = &txn->mt_txnid; do { - befree_count = befree_pages[0]; - data.iov_len = MDBX_PNL_SIZEOF(befree_pages); + data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); - 
if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - /* Retry if mt_free_pages[] grew during the Put() */ - befree_pages = txn->mt_befree_pages; - } while (befree_count < befree_pages[0]); + /* Retry if mt_befree_pages[] grew during the Put() */ + } while (data.iov_len < MDBX_PNL_SIZEOF(txn->mt_befree_pages)); - mdbx_pnl_sort(befree_pages); - memcpy(data.iov_base, befree_pages, data.iov_len); + befree_stored = (unsigned)txn->mt_befree_pages[0]; + mdbx_pnl_sort(txn->mt_befree_pages); + memcpy(data.iov_base, txn->mt_befree_pages, data.iov_len); + + mdbx_trace("%s.put-befree #%u @ %" PRIaTXN, dbg_prefix_mode, + befree_stored, txn->mt_txnid); if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { - unsigned i = (unsigned)befree_pages[0]; + unsigned i = befree_stored; mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO "", befree_pages[i]); + mdbx_debug_extra_print(" %" PRIaPGNO "", txn->mt_befree_pages[i]); mdbx_debug_extra_print("\n"); } continue; } + // handle reclaimed and loost pages - merge and store both into gc mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); - const intptr_t rpl_len = - (env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0) + - txn->mt_loose_count; - if (rpl_len && refill_reclaimed_pos == 0) - refill_reclaimed_pos = 1; + mdbx_tassert(txn, txn->mt_loose_count == 0); + const unsigned amount = + env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0; + const unsigned left = amount - placed; - /* Reserve records for me_reclaimed_pglist[]. Split it if multi-page, - * to avoid searching freeDB for a page range. Use keys in - * range [1,me_last_reclaimed]: Smaller than txnid of oldest reader. 
*/ - if (total_room >= rpl_len) { - if (total_room == rpl_len || --more < 0) - break; - } else if (head_room >= (intptr_t)env->me_maxfree_1pg && head_id > 1) { - /* Keep current record (overflow page), add a new one */ - head_id--; - refill_reclaimed_pos++; - head_room = 0; - } + mdbx_trace("%s: amount %u, placed %d, left %d", dbg_prefix_mode, amount, + placed, (int)left); + if (0 >= (int)left) + break; + mdbx_trace(" >> reserving"); + txnid_t reservation_gc_id; + const unsigned lifo_gc_slots = + txn->mt_lifo_reclaimed ? (unsigned)txn->mt_lifo_reclaimed[0] : 0; if (lifo) { - if (refill_reclaimed_pos > - (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) { + if (reused_gc_slot >= lifo_gc_slots) { /* LY: need just a txn-id for save page list. */ rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); - if (likely(rc == 0)) + if (likely(rc == MDBX_SUCCESS)) /* LY: ok, reclaimed from freedb. */ continue; if (unlikely(rc != MDBX_NOTFOUND)) /* LY: other troubles... */ goto bailout; - /* LY: freedb is empty, will look any free txn-id in high2low order. */ - if (unlikely(env->me_last_reclaimed < 1)) { - /* LY: not any txn in the past of freedb. */ - rc = MDBX_MAP_FULL; - goto bailout; - } - if (unlikely(!txn->mt_lifo_reclaimed)) { txn->mt_lifo_reclaimed = mdbx_txl_alloc(); if (unlikely(!txn->mt_lifo_reclaimed)) { @@ -3727,57 +3732,80 @@ again_on_freelist_change: goto bailout; } } - /* LY: append the list. */ - rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, - env->me_last_reclaimed - 1); - if (unlikely(rc)) + /* LY: freedb is empty, will look any free txn-id in high2low order. */ + rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, --head_gc_id); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - --env->me_last_reclaimed; - /* LY: note that freeDB cleanup is not needed. */ - ++cleanup_reclaimed_pos; + cleaned_gc_slot += 1 /* mark GC cleanup is not needed. 
*/; + + mdbx_trace("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %u", + dbg_prefix_mode, head_gc_id, cleaned_gc_slot); } mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL); - head_id = txn->mt_lifo_reclaimed[refill_reclaimed_pos]; + reservation_gc_id = txn->mt_lifo_reclaimed[++reused_gc_slot]; + mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", + dbg_prefix_mode, reservation_gc_id, reused_gc_slot); + head_gc_id = + (head_gc_id > reservation_gc_id) ? reservation_gc_id : head_gc_id; } else { mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL); + reused_gc_slot++ /* just count reserved records */; + reservation_gc_id = head_gc_id--; + mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, + reservation_gc_id); } - /* (Re)write {key = head_id, PNL length = head_room} */ - total_room -= head_room; - head_room = rpl_len - total_room; - if (head_room > (intptr_t)env->me_maxfree_1pg && head_id > 1) { - /* Overflow multi-page for part of me_reclaimed_pglist */ - head_room /= (head_id < INT16_MAX) ? (pgno_t)head_id - : INT16_MAX; /* amortize page sizes */ - head_room += env->me_maxfree_1pg - head_room % (env->me_maxfree_1pg + 1); - } else if (head_room < 0) { - /* Rare case, not bothering to delete this record */ - head_room = 0; - continue; + mdbx_trace("%s: head_gc_id %" PRIaTXN + ", reused_gc_slot %u, lifo_gc_slots %u, reservation-id " + "%" PRIaTXN, + dbg_prefix_mode, head_gc_id, reused_gc_slot, lifo_gc_slots, + reservation_gc_id); + + const bool no_slots_more = + head_gc_id < 2 && (!lifo || reused_gc_slot >= lifo_gc_slots); + const unsigned chunk = + (left < env->me_maxgc_ov1page || no_slots_more) + ? left + : (left < env->me_maxgc_ov1page * 2) + ? /* the half to each of the last two chunks */ left / 2 + : env->me_maxgc_ov1page; + + mdbx_trace("%s: chunk %u, no_slots_more %s, gc-per-ovpage %u", + dbg_prefix_mode, chunk, no_slots_more ? 
"yes" : "no", + env->me_maxgc_ov1page); + + mdbx_tassert(txn, reservation_gc_id < *env->me_oldest); + if (unlikely(reservation_gc_id < 1)) { + /* LY: not any txn in the past of freedb. */ + rc = MDBX_PROBLEM; + goto bailout; } - key.iov_len = sizeof(head_id); - key.iov_base = &head_id; - data.iov_len = (head_room + 1) * sizeof(pgno_t); - rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); + + key.iov_len = sizeof(reservation_gc_id); + key.iov_base = &reservation_gc_id; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, + placed + 1, placed + chunk + 1, reservation_gc_id); + rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* PNL is initially empty, zero out at least the length */ - pgno_t *pgs = (pgno_t *)data.iov_base; - intptr_t i = head_room > clean_limit ? head_room : 0; - do { - pgs[i] = 0; - } while (--i >= 0); - total_room += head_room; + memset(data.iov_base, 0, sizeof(pgno_t)); + placed += chunk; + mdbx_trace("%s.placed %u (+%u), continue", dbg_prefix_mode, placed, chunk); continue; } mdbx_tassert(txn, - cleanup_reclaimed_pos == + cleaned_gc_slot == (txn->mt_lifo_reclaimed ? 
txn->mt_lifo_reclaimed[0] : 0)); - /* Fill in the reserved me_reclaimed_pglist records */ + mdbx_trace(" >> filling"); + /* Fill in the reserved records */ + filled_gc_slot = reused_gc_slot; rc = MDBX_SUCCESS; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (env->me_reclaimed_pglist && env->me_reclaimed_pglist[0]) { @@ -3785,89 +3813,109 @@ again_on_freelist_change: key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - size_t rpl_left = env->me_reclaimed_pglist[0]; - pgno_t *rpl_end = env->me_reclaimed_pglist + rpl_left; - if (txn->mt_lifo_reclaimed == 0) { + unsigned left = env->me_reclaimed_pglist[0]; + pgno_t *end = env->me_reclaimed_pglist + left; + if (txn->mt_lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); rc = mdbx_cursor_first(&mc, &key, &data); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { mdbx_tassert(txn, lifo != 0); } while (1) { - txnid_t id; - if (txn->mt_lifo_reclaimed == 0) { + txnid_t fill_gc_id; + mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, + (unsigned)env->me_reclaimed_pglist[0]); + if (txn->mt_lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); - id = *(txnid_t *)key.iov_base; - mdbx_tassert(txn, id <= env->me_last_reclaimed); + fill_gc_id = *(txnid_t *)key.iov_base; + if (filled_gc_slot-- /* just countdown reserved records */ == 0 || + fill_gc_id > env->me_last_reclaimed) { + mdbx_notice( + "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN + " > last_reclaimed %" PRIaTXN, + filled_gc_slot, fill_gc_id, env->me_last_reclaimed); + goto retry; + } } else { mdbx_tassert(txn, lifo != 0); - mdbx_tassert(txn, - refill_reclaimed_pos > 0 && - refill_reclaimed_pos <= txn->mt_lifo_reclaimed[0]); - id = txn->mt_lifo_reclaimed[refill_reclaimed_pos--]; - key.iov_base = &id; - key.iov_len = sizeof(id); + if (filled_gc_slot == 0) { + mdbx_notice("** restart: reserve depleted (filled_slot == 0)"); + goto retry; + } + 
mdbx_tassert(txn, filled_gc_slot > 0 && + filled_gc_slot <= txn->mt_lifo_reclaimed[0]); + fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot--]; + mdbx_trace("%s.seek-reservaton @%" PRIaTXN " at lifo_reclaimed[%u]", + dbg_prefix_mode, fill_gc_id, (unsigned)filled_gc_slot); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } mdbx_tassert( - txn, cleanup_reclaimed_pos == + txn, cleaned_gc_slot == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); - size_t chunk_len = (data.iov_len / sizeof(pgno_t)) - 1; - if (chunk_len > rpl_left) - chunk_len = rpl_left; - data.iov_len = (chunk_len + 1) * sizeof(pgno_t); - key.iov_base = &id; - key.iov_len = sizeof(id); + const size_t space = (data.iov_len / sizeof(pgno_t)) - 1; + const unsigned chunk = (space > left) ? left : (unsigned)space; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < *env->me_oldest); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); - rpl_end -= chunk_len; - data.iov_base = rpl_end; - pgno_t save = rpl_end[0]; - rpl_end[0] = (pgno_t)chunk_len; - mdbx_tassert(txn, mdbx_pnl_check(rpl_end)); + end -= chunk; + data.iov_base = end; + pgno_t save = end[0]; + end[0] = (pgno_t)chunk; + mdbx_tassert(txn, mdbx_pnl_check(end)); mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT); mc.mc_flags ^= C_RECLAIMING; - mdbx_tassert(txn, mdbx_pnl_check(rpl_end)); + mdbx_tassert(txn, mdbx_pnl_check(end)); mdbx_tassert( - txn, cleanup_reclaimed_pos == + txn, cleaned_gc_slot == (txn->mt_lifo_reclaimed ? 
txn->mt_lifo_reclaimed[0] : 0)); - rpl_end[0] = save; - if (unlikely(rc)) + pgno_t *from = end + 1, *to = end + end[0]; + mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO + "] @%" PRIaTXN, + dbg_prefix_mode, (unsigned)end[0], + (unsigned)(from - env->me_reclaimed_pglist), *from, + (unsigned)(to - env->me_reclaimed_pglist), *to, fill_gc_id); + end[0] = save; + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rpl_left -= chunk_len; - if (rpl_left == 0) + left -= chunk; + if (left == 0) { + rc = MDBX_SUCCESS; break; + } if (!lifo) { rc = mdbx_cursor_next(&mc, &key, &data, MDBX_NEXT); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } } + mdbx_tassert(txn, rc == MDBX_SUCCESS); + if (txn->mt_lifo_reclaimed) { + mdbx_tassert(txn, cleaned_gc_slot == txn->mt_lifo_reclaimed[0]); + if (unlikely(filled_gc_slot != 0)) { + mdbx_notice("** restart: reserve excess (filled-slot %u > 0)", + filled_gc_slot); + goto retry; + } + } + bailout: if (txn->mt_lifo_reclaimed) { - mdbx_tassert(txn, rc || cleanup_reclaimed_pos == txn->mt_lifo_reclaimed[0]); - if (rc == MDBX_SUCCESS && - cleanup_reclaimed_pos != txn->mt_lifo_reclaimed[0]) { - mdbx_tassert(txn, cleanup_reclaimed_pos < txn->mt_lifo_reclaimed[0]); - /* LY: zeroed cleanup_idx to force cleanup - * and refill created freeDB records. 
*/ - cleanup_reclaimed_pos = 0; - /* LY: restart filling */ - total_room = head_room = refill_reclaimed_pos = 0; - more = 1; - goto again_on_freelist_change; - } txn->mt_lifo_reclaimed[0] = 0; if (txn != env->me_txn0) { mdbx_txl_free(txn->mt_lifo_reclaimed); @@ -3875,6 +3923,7 @@ bailout: } } + mdbx_trace("<<< rc = %d", rc); return rc; } @@ -3997,7 +4046,7 @@ static __cold bool mdbx_txn_import_dbi(MDBX_txn *txn, MDBX_dbi dbi) { (env->me_dbflags[i] & MDBX_VALID)) { txn->mt_dbs[i].md_flags = env->me_dbflags[i] & PERSISTENT_FLAGS; txn->mt_dbflags[i] = DB_VALID | DB_USRVALID | DB_STALE; - assert(txn->mt_dbxs[i].md_cmp != NULL); + mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); } } txn->mt_numdbs = snap_numdbs; @@ -4241,7 +4290,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { } } - rc = mdbx_freelist_save(txn); + rc = mdbx_update_gc(txn); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -4845,7 +4894,8 @@ int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { #define mdbx_maxkey(nodemax) ((nodemax) - (NODESIZE + sizeof(MDBX_db))) -#define mdbx_maxfree1pg(pagesize) (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) +#define mdbx_maxgc_ov1page(pagesize) \ + (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) int mdbx_get_maxkeysize(size_t pagesize) { if (pagesize == 0) @@ -4867,11 +4917,11 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { mdbx_ensure(env, pagesize <= MAX_PAGESIZE); env->me_psize = (unsigned)pagesize; - STATIC_ASSERT(mdbx_maxfree1pg(MIN_PAGESIZE) > 42); - STATIC_ASSERT(mdbx_maxfree1pg(MAX_PAGESIZE) < MDBX_PNL_DB_MAX); - const intptr_t maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - mdbx_ensure(env, maxfree_1pg > 42 && maxfree_1pg < MDBX_PNL_DB_MAX); - env->me_maxfree_1pg = (unsigned)maxfree_1pg; + STATIC_ASSERT(mdbx_maxgc_ov1page(MIN_PAGESIZE) > 42); + STATIC_ASSERT(mdbx_maxgc_ov1page(MAX_PAGESIZE) < MDBX_PNL_DB_MAX); + const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + mdbx_ensure(env, maxgc_ov1page > 
42 && maxgc_ov1page < MDBX_PNL_DB_MAX); + env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; STATIC_ASSERT(mdbx_nodemax(MIN_PAGESIZE) > 42); STATIC_ASSERT(mdbx_nodemax(MAX_PAGESIZE) < UINT16_MAX); From bc6db4e4d7a6ac4b5b19709fa0abd62c5d16ae30 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 30 Jun 2018 00:42:29 +0300 Subject: [PATCH 22/83] mdbx: backport - allow mdbx_env_compact() to fix page leaks. Don't treat fixing page leaks as an error while copy DB with compactification. Change-Id: I2a575ff9e2b24610172aaca939b5f6957c26ec77 --- src/mdbx.c | 68 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 4c7cc2e6..91dac812 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10137,8 +10137,8 @@ typedef struct mdbx_copy { MDBX_env *mc_env; MDBX_txn *mc_txn; mdbx_condmutex_t mc_condmutex; - char *mc_wbuf[2]; - char *mc_over[2]; + uint8_t *mc_wbuf[2]; + uint8_t *mc_over[2]; size_t mc_wlen[2]; size_t mc_olen[2]; mdbx_filehandle_t mc_fd; @@ -10153,7 +10153,7 @@ typedef struct mdbx_copy { /* Dedicated writer thread for compacting copy. 
*/ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; - char *ptr; + uint8_t *ptr; int toggle = 0; int rc; @@ -10314,7 +10314,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { my->mc_wlen[toggle] += my->mc_env->me_psize; if (omp->mp_pages > 1) { my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1); - my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; + my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize; rc = mdbx_env_cthr_toggle(my, 1); if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -10394,23 +10394,26 @@ done: static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { MDBX_txn *txn = NULL; mdbx_thread_t thr; - mdbx_copy my; - memset(&my, 0, sizeof(my)); + mdbx_copy ctx; + memset(&ctx, 0, sizeof(ctx)); - int rc = mdbx_condmutex_init(&my.mc_condmutex); + int rc = mdbx_condmutex_init(&ctx.mc_condmutex); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_memalign_alloc(env->me_os_psize, MDBX_WBUF * 2, - (void **)&my.mc_wbuf[0]); + + const size_t buffer_size = pgno2bytes(env, NUM_METAS) + MDBX_WBUF * 2; + uint8_t *buffer = NULL; + rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); if (unlikely(rc != MDBX_SUCCESS)) goto done; - memset(my.mc_wbuf[0], 0, MDBX_WBUF * 2); - my.mc_wbuf[1] = my.mc_wbuf[0] + MDBX_WBUF; - my.mc_next_pgno = NUM_METAS; - my.mc_env = env; - my.mc_fd = fd; - rc = mdbx_thread_create(&thr, mdbx_env_copythr, &my); + ctx.mc_wbuf[0] = buffer + pgno2bytes(env, NUM_METAS); + memset(ctx.mc_wbuf[0], 0, MDBX_WBUF * 2); + ctx.mc_wbuf[1] = ctx.mc_wbuf[0] + MDBX_WBUF; + ctx.mc_next_pgno = NUM_METAS; + ctx.mc_env = env; + ctx.mc_fd = fd; + rc = mdbx_thread_create(&thr, mdbx_env_copythr, &ctx); if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -10418,7 +10421,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { if (unlikely(rc != MDBX_SUCCESS)) goto finish; - MDBX_page *meta = mdbx_init_metas(env, 
my.mc_wbuf[0]); + MDBX_page *const meta = mdbx_init_metas(env, buffer); /* Set metapage 1 with current main DB */ pgno_t new_root, root = txn->mt_dbs[MAIN_DBI].md_root; @@ -10460,25 +10463,38 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { /* update signature */ meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); - my.mc_wlen[0] = pgno2bytes(env, NUM_METAS); - my.mc_txn = txn; - rc = mdbx_env_cwalk(&my, &root, 0); + ctx.mc_wlen[0] = pgno2bytes(env, NUM_METAS); + ctx.mc_txn = txn; + rc = mdbx_env_cwalk(&ctx, &root, 0); if (rc == MDBX_SUCCESS && root != new_root) { - mdbx_error("unexpected root %" PRIaPGNO " (%" PRIaPGNO ")", root, new_root); - rc = MDBX_PROBLEM; /* page leak or corrupt DB */ + if (root > new_root) { + mdbx_error("post-compactification root %" PRIaPGNO + " GT expected %" PRIaPGNO " (source DB corrupted)", + root, new_root); + rc = MDBX_CORRUPTED; /* page leak or corrupt DB */ + } else { + mdbx_error("post-compactification root %" PRIaPGNO + " LT expected %" PRIaPGNO " (page leak(s) in source DB)", + root, new_root); + /* fixup and rewrite metas */ + meta->mp_meta.mm_dbs[MAIN_DBI].md_root = root; + meta->mp_meta.mm_geo.next = meta->mp_meta.mm_geo.now = root + 1; + meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); + rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + } } finish: if (rc != MDBX_SUCCESS) - my.mc_error = rc; - mdbx_env_cthr_toggle(&my, 1 | MDBX_EOF); + ctx.mc_error = rc; + mdbx_env_cthr_toggle(&ctx, 1 | MDBX_EOF); rc = mdbx_thread_join(thr); mdbx_txn_abort(txn); done: - mdbx_memalign_free(my.mc_wbuf[0]); - mdbx_condmutex_destroy(&my.mc_condmutex); - return rc ? rc : my.mc_error; + mdbx_memalign_free(buffer); + mdbx_condmutex_destroy(&ctx.mc_condmutex); + return rc ? rc : ctx.mc_error; } /* Copy environment as-is. 
*/ From 888003c072dd98949cb3007517acb61dce486ca8 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 19 Aug 2018 02:04:44 +0300 Subject: [PATCH 23/83] mdbx: backport - fix comments typos (squashed). --- mdbx.h | 2 +- src/bits.h | 2 +- src/mdbx.c | 2 +- test/osal-windows.cc | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mdbx.h b/mdbx.h index 9758fe57..276c8ea6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -468,7 +468,7 @@ typedef struct MDBX_envinfo { uint64_t lower; /* lower limit for datafile size */ uint64_t upper; /* upper limit for datafile size */ uint64_t current; /* current datafile size */ - uint64_t shrink; /* shrink theshold for datafile */ + uint64_t shrink; /* shrink threshold for datafile */ uint64_t grow; /* growth step for datafile */ } mi_geo; uint64_t mi_mapsize; /* Size of the data memory map */ diff --git a/src/bits.h b/src/bits.h index a217444b..37d528f4 100644 --- a/src/bits.h +++ b/src/bits.h @@ -408,7 +408,7 @@ typedef struct MDBX_lockinfo { volatile uint32_t mti_envmode; #ifdef MDBX_OSAL_LOCK - /* Mutex protecting write access to this table. */ + /* Mutex protecting write-txn. */ union { MDBX_OSAL_LOCK mti_wmutex; uint8_t pad_mti_wmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)]; diff --git a/src/mdbx.c b/src/mdbx.c index 91dac812..95e74ca4 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5391,7 +5391,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { /* apply preconfigured params, but only if substantial changes: * - upper or lower limit changes - * - shrink theshold or growth step + * - shrink threshold or growth step * But ignore just chagne just a 'now/current' size. 
*/ if (bytes_align2os_bytes(env, env->me_dbgeo.upper) != pgno_align2os_bytes(env, meta.mm_geo.upper) || diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 109c835a..b8cdb535 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -53,7 +53,7 @@ void osal_wait4barrier(void) { } } -static HANDLE make_inharitable(HANDLE hHandle) { +static HANDLE make_inheritable(HANDLE hHandle) { assert(hHandle != NULL && hHandle != INVALID_HANDLE_VALUE); if (!DuplicateHandle(GetCurrentProcess(), hHandle, GetCurrentProcess(), &hHandle, 0, TRUE, @@ -71,7 +71,7 @@ void osal_setup(const std::vector &actors) { HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); if (!hEvent) failure_perror("CreateEvent()", GetLastError()); - hEvent = make_inharitable(hEvent); + hEvent = make_inheritable(hEvent); log_trace("osal_setup: event %" PRIuPTR " -> %p", i, hEvent); events[i] = hEvent; } @@ -79,12 +79,12 @@ void osal_setup(const std::vector &actors) { hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL); if (!hBarrierSemaphore) failure_perror("CreateSemaphore(BarrierSemaphore)", GetLastError()); - hBarrierSemaphore = make_inharitable(hBarrierSemaphore); + hBarrierSemaphore = make_inheritable(hBarrierSemaphore); hBarrierEvent = CreateEvent(NULL, TRUE, FALSE, NULL); if (!hBarrierEvent) failure_perror("CreateEvent(BarrierEvent)", GetLastError()); - hBarrierEvent = make_inharitable(hBarrierEvent); + hBarrierEvent = make_inheritable(hBarrierEvent); } void osal_broadcast(unsigned id) { From bd672a558342bff6d36ada4ea2ba9ec4ffc9a2a9 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 20 Aug 2018 12:30:09 +0300 Subject: [PATCH 24/83] mdbx: backport - add mdbx_limits_xyz() (squashed). 
Change-Id: I56c79704c59386a0c4d84b001020484c23925e6c --- mdbx.h | 7 ++++++- src/mdbx.c | 45 +++++++++++++++++++++++++++++++++++++-------- test/main.cc | 6 +++--- 3 files changed, 46 insertions(+), 12 deletions(-) diff --git a/mdbx.h b/mdbx.h index 276c8ea6..2d15cc24 100644 --- a/mdbx.h +++ b/mdbx.h @@ -900,7 +900,7 @@ LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs); * * Returns The maximum size of a key we can write. */ LIBMDBX_API int mdbx_env_get_maxkeysize(MDBX_env *env); -LIBMDBX_API int mdbx_get_maxkeysize(size_t pagesize); +LIBMDBX_API int mdbx_get_maxkeysize(intptr_t pagesize); /* Set application information associated with the MDBX_env. * @@ -1673,6 +1673,11 @@ LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr); LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, uint64_t increment); +LIBMDBX_API int mdbx_limits_pgsize_min(void); +LIBMDBX_API int mdbx_limits_pgsize_max(void); +LIBMDBX_API intptr_t mdbx_limits_dbsize_min(intptr_t pagesize); +LIBMDBX_API intptr_t mdbx_limits_dbsize_max(intptr_t pagesize); + /*----------------------------------------------------------------------------*/ /* attribute support functions for Nexenta */ typedef uint_fast64_t mdbx_attr_t; diff --git a/src/mdbx.c b/src/mdbx.c index 95e74ca4..d5a369c9 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4897,16 +4897,15 @@ int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { #define mdbx_maxgc_ov1page(pagesize) \ (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) -int mdbx_get_maxkeysize(size_t pagesize) { - if (pagesize == 0) - pagesize = mdbx_syspagesize(); - - intptr_t nodemax = mdbx_nodemax(pagesize); - if (nodemax < 0) +int mdbx_get_maxkeysize(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_syspagesize(); + else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !mdbx_is_power2((size_t)pagesize))) return -MDBX_EINVAL; - intptr_t maxkey = mdbx_maxkey(nodemax); - return 
(maxkey > 0 && maxkey < INT_MAX) ? (int)maxkey : -MDBX_EINVAL; + return mdbx_maxkey(mdbx_nodemax(pagesize)); } static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { @@ -12233,6 +12232,36 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, return MDBX_SUCCESS; } +/*----------------------------------------------------------------------------*/ + +__cold int mdbx_limits_pgsize_min(void) { return MIN_PAGESIZE; } + +__cold int mdbx_limits_pgsize_max(void) { return MAX_PAGESIZE; } + +__cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_syspagesize(); + else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !mdbx_is_power2((size_t)pagesize))) + return -MDBX_EINVAL; + + return MIN_PAGENO * pagesize; +} + +__cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_syspagesize(); + else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !mdbx_is_power2((size_t)pagesize))) + return -MDBX_EINVAL; + + const uint64_t limit = MAX_PAGENO * (uint64_t)pagesize; + return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit + : (intptr_t)MAX_MAPSIZE; /* clamp to the map-size limit */ +} + /*----------------------------------------------------------------------------*/ /* attribute support functions for Nexenta */ diff --git a/test/main.cc b/test/main.cc index e6e0e177..d417480d 100644 --- a/test/main.cc +++ b/test/main.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -195,8 +195,8 @@ int main(int argc, char *const argv[]) { } if (config::parse_option( argc, argv, narg, "keylen.max", params.keylen_max, config::no_scale, - 0, std::min(mdbx_get_maxkeysize(0), (int)UINT16_MAX))) { - + 0, + std::min((unsigned)mdbx_get_maxkeysize(0), (unsigned)UINT16_MAX))) { if (params.keylen_min > params.keylen_max) params.keylen_min = params.keylen_max; continue; From 1b2b98234f214e09a97c1bf8f0694e1e3f26a346 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 23 Aug 2018 15:32:20 +0300 Subject: [PATCH 25/83] mdbx: backport - fix concurrent opening with custom pagesize (get pagesize from meta-page early). --- src/mdbx.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index d5a369c9..47b4b5f4 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4365,6 +4365,8 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, unsigned retryleft = 42; while (1) { + mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u", + meta_number, offset, (unsigned)sizeof(page), retryleft); int err = mdbx_pread(env->me_fd, &page, sizeof(page), offset); if (err != MDBX_SUCCESS) { mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), @@ -4386,9 +4388,12 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, mdbx_info("meta[%u] was updated, re-read it", meta_number); } - if (!retryleft) { - mdbx_error("meta[%u] is too volatile, skip it", meta_number); - continue; + if (page.mp_meta.mm_magic_and_version != MDBX_DATA_MAGIC) { + mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, + page.mp_meta.mm_magic_and_version); + return ((page.mp_meta.mm_magic_and_version >> 8) != MDBX_MAGIC) + ? 
MDBX_INVALID + : MDBX_VERSION_MISMATCH; } if (page.mp_pgno != meta_number) { @@ -4397,17 +4402,31 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, return MDBX_INVALID; } - if (!F_ISSET(page.mp_flags, P_META)) { + if (page.mp_flags != P_META) { mdbx_error("page #%u not a meta-page", meta_number); return MDBX_INVALID; } - if (page.mp_meta.mm_magic_and_version != MDBX_DATA_MAGIC) { - mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, - page.mp_meta.mm_magic_and_version); - return ((page.mp_meta.mm_magic_and_version >> 8) != MDBX_MAGIC) - ? MDBX_INVALID - : MDBX_VERSION_MISMATCH; + /* LY: check pagesize */ + if (!mdbx_is_power2(page.mp_meta.mm_psize) || + page.mp_meta.mm_psize < MIN_PAGESIZE || + page.mp_meta.mm_psize > MAX_PAGESIZE) { + mdbx_notice("meta[%u] has invalid pagesize (%u), skip it", meta_number, + page.mp_meta.mm_psize); + rc = mdbx_is_power2(page.mp_meta.mm_psize) ? MDBX_VERSION_MISMATCH + : MDBX_INVALID; + continue; + } + + if (meta_number == 0 && guess_pagesize != page.mp_meta.mm_psize) { + meta->mm_psize = page.mp_meta.mm_psize; + mdbx_info("meta[%u] took pagesize %u", meta_number, + page.mp_meta.mm_psize); + } + + if (!retryleft) { + mdbx_error("meta[%u] is too volatile, skip it", meta_number); + continue; } if (page.mp_meta.mm_txnid_a != page.mp_meta.mm_txnid_b) { @@ -4425,16 +4444,6 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, continue; } - /* LY: check pagesize */ - if (!mdbx_is_power2(page.mp_meta.mm_psize) || - page.mp_meta.mm_psize < MIN_PAGESIZE || - page.mp_meta.mm_psize > MAX_PAGESIZE) { - mdbx_notice("meta[%u] has invalid pagesize (%u), skip it", meta_number, - page.mp_meta.mm_psize); - rc = MDBX_VERSION_MISMATCH; - continue; - } - mdbx_debug("read meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", From 34300150a17d99f18ce9a0bc5aabc51bc1ad67a7 Mon Sep 17 00:00:00 2001 From: Leo 
Yuriev Date: Thu, 23 Aug 2018 16:13:47 +0300 Subject: [PATCH 26/83] mdbx: backport - don't touch `mm_psize` and `mm_flags` while provoking bad readers (debug-only). --- src/mdbx.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 47b4b5f4..d1567925 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4806,9 +4806,11 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_datasync_sign = MDBX_DATASIGN_WEAK; mdbx_meta_update_begin(env, target, pending->mm_txnid_a); #ifndef NDEBUG - /* debug: provoke failure to catch a violators */ - memset(target->mm_dbs, 0xCC, - sizeof(target->mm_dbs) + sizeof(target->mm_canary)); + /* debug: provoke failure to catch a violators, but don't touch mm_psize + * and mm_flags to allow readers catch actual pagesize. */ + uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; + uint8_t *provoke_end = (uint8_t *)&target->mm_datasync_sign; + memset(provoke_begin, 0xCC, provoke_end - provoke_begin); mdbx_jitter4testing(false); #endif From 6d438605dd0d7bf3947e49a6e98b33115460dbd1 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 24 Aug 2018 04:00:59 +0300 Subject: [PATCH 27/83] mdbx: backport - check comparator for MDBX_GET_BOTH and MDBX_GET_BOTH_RANGE. 
--- src/mdbx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index d1567925..ea2c6619 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7130,9 +7130,11 @@ set1: MDBX_val olddata; if (unlikely((rc = mdbx_node_read(mc, leaf, &olddata)) != MDBX_SUCCESS)) return rc; + if (unlikely(mc->mc_dbx->md_dcmp == NULL)) + return MDBX_EINVAL; rc = mc->mc_dbx->md_dcmp(data, &olddata); if (rc) { - if (op == MDBX_GET_BOTH || rc > 0) + if (op != MDBX_GET_BOTH_RANGE || rc > 0) return MDBX_NOTFOUND; rc = 0; } From a9244f807bb18e8fd4e2e20bb78c4fe85caefd10 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 24 Aug 2018 04:04:24 +0300 Subject: [PATCH 28/83] mdbx: backport - setup mdbx_cmp_memn() as data-comparator for safety. --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index ea2c6619..ac9ef367 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10768,7 +10768,7 @@ static MDBX_cmp_func *mdbx_default_keycmp(unsigned flags) { static MDBX_cmp_func *mdbx_default_datacmp(unsigned flags) { return !(flags & MDBX_DUPSORT) - ? 0 + ? mdbx_cmp_memn : ((flags & MDBX_INTEGERDUP) ? mdbx_cmp_int_ua : ((flags & MDBX_REVERSEDUP) ? mdbx_cmp_memnr From 014be165c3a04428d732f43d0978b6ac22c140c8 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 3 Sep 2018 14:27:38 +0300 Subject: [PATCH 29/83] mdbx: backport - allow GC's PNL be partially unused. 
--- src/mdbx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index ac9ef367..a6c23c4c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2300,8 +2300,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, /* Append PNL from FreeDB record to me_reclaimed_pglist */ pgno_t *re_pnl = (pgno_t *)data.iov_base; - mdbx_tassert(txn, re_pnl[0] == 0 || - data.iov_len == (re_pnl[0] + 1) * sizeof(pgno_t)); + mdbx_tassert(txn, data.iov_len >= (re_pnl[0] + 1) * sizeof(pgno_t)); mdbx_tassert(txn, mdbx_pnl_check(re_pnl)); repg_pos = re_pnl[0]; if (!repg_list) { From 204b5a532d2e84ee4b9fb4919851667efffebd0c Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 31 Aug 2018 03:03:53 +0300 Subject: [PATCH 30/83] mdbx: backport - shorten maxkeysize (will be fixed in the master branch). Change-Id: I660b1b3e454d9b51a24d3b4cc987c8e2980bd435 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index a6c23c4c..e279b447 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4902,7 +4902,7 @@ int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { #define mdbx_nodemax(pagesize) \ (((((pagesize)-PAGEHDRSZ) / MDBX_MINKEYS) & -(intptr_t)2) - sizeof(indx_t)) -#define mdbx_maxkey(nodemax) ((nodemax) - (NODESIZE + sizeof(MDBX_db))) +#define mdbx_maxkey(nodemax) (((nodemax)-NODESIZE - sizeof(MDBX_db)) / 2) #define mdbx_maxgc_ov1page(pagesize) \ (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) From 912728a322192f11bccb2db96da0d963b932e766 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 16 Sep 2018 17:53:35 +0300 Subject: [PATCH 31/83] mdbx: backport - fix mdbx_replace(). 
Change-Id: I2af00f101017795ca2b967479f86e5ea7e8ad37b --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index e279b447..8b7f4b82 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -11996,7 +11996,7 @@ int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, rc = mdbx_cursor_get(&mc, &present_key, &present_data, MDBX_SET_KEY); if (unlikely(rc != MDBX_SUCCESS)) { old_data->iov_base = NULL; - old_data->iov_len = rc; + old_data->iov_len = 0; if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT)) goto bailout; } else if (flags & MDBX_NOOVERWRITE) { From 337f7589f83bce306283bbbd0624a3e173660515 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Sep 2018 22:48:01 +0300 Subject: [PATCH 32/83] mdbx: backport - fix mdbx_pnl_search(). --- src/mdbx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 8b7f4b82..577f3c01 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -650,8 +650,8 @@ static unsigned __hot mdbx_pnl_search(MDBX_PNL pnl, pgno_t id) { while (n > 0) { unsigned pivot = n >> 1; cursor = base + pivot + 1; - val = MDBX_PNL_ASCENDING ? mdbx_cmp2int(pnl[cursor], id) - : mdbx_cmp2int(id, pnl[cursor]); + val = MDBX_PNL_ASCENDING ? mdbx_cmp2int(id, pnl[cursor]) + : mdbx_cmp2int(pnl[cursor], id); if (val < 0) { n = pivot; From d757ba1266d8bd7e6310889a0b85b8e1650344f8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 13 Sep 2018 19:21:57 +0300 Subject: [PATCH 33/83] mdbx: backport - fix MDBX_CORRUPTED due open/shrink collision. 
--- src/mdbx.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 577f3c01..ffa2cfbc 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4406,6 +4406,11 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, return MDBX_INVALID; } + if (!retryleft) { + mdbx_error("meta[%u] is too volatile, skip it", meta_number); + continue; + } + /* LY: check pagesize */ if (!mdbx_is_power2(page.mp_meta.mm_psize) || page.mp_meta.mm_psize < MIN_PAGESIZE || @@ -4423,11 +4428,6 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, page.mp_meta.mm_psize); } - if (!retryleft) { - mdbx_error("meta[%u] is too volatile, skip it", meta_number); - continue; - } - if (page.mp_meta.mm_txnid_a != page.mp_meta.mm_txnid_b) { mdbx_warning("meta[%u] not completely updated, skip it", meta_number); continue; @@ -4494,11 +4494,16 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, const uint64_t used_bytes = page.mp_meta.mm_geo.next * (uint64_t)page.mp_meta.mm_psize; if (used_bytes > *filesize) { - mdbx_notice("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 - "), skip it", - meta_number, used_bytes, *filesize); - rc = MDBX_CORRUPTED; - continue; + rc = mdbx_filesize(env->me_fd, filesize); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (used_bytes > *filesize) { + mdbx_notice("meta[%u] used-bytes (%" PRIu64 + ") beyond filesize (%" PRIu64 "), skip it", + meta_number, used_bytes, *filesize); + rc = MDBX_CORRUPTED; + continue; + } } /* LY: check mapsize limits */ From cdd510d20e9bea67be8b00af93a3e4bd32450286 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Sep 2018 23:05:04 +0300 Subject: [PATCH 34/83] mdbx: backport - prevent DB corruption due rebalance bugs. Won't fix https://github.com/leo-yuriev/libmdbx/issues/38 in the 'stable/0.1' branch, but add checks to prevent DB corruption. 
--- src/bits.h | 2 ++ src/mdbx.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/bits.h b/src/bits.h index 37d528f4..dd97adf5 100644 --- a/src/bits.h +++ b/src/bits.h @@ -1043,6 +1043,8 @@ static __inline unsigned mdbx_log2(size_t value) { /* Test if a page is a sub page */ #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) +#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) + /* The number of overflow pages needed to store the given size. */ #define OVPAGES(env, size) (bytes2pgno(env, PAGEHDRSZ - 1 + (size)) + 1) diff --git a/src/mdbx.c b/src/mdbx.c index ffa2cfbc..b968554f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8927,6 +8927,14 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { DKBUF; + mdbx_tassert(csrc->mc_txn, PAGETYPE(csrc->mc_pg[csrc->mc_top]) == + PAGETYPE(cdst->mc_pg[cdst->mc_top])); + if (unlikely(PAGETYPE(csrc->mc_pg[csrc->mc_top]) != + PAGETYPE(cdst->mc_pg[cdst->mc_top]))) { + cdst->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; + } + /* Mark src and dst as dirty. */ if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) return rc; @@ -9161,6 +9169,12 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ mdbx_cassert(csrc, cdst->mc_snum > 1); + mdbx_tassert(csrc->mc_txn, PAGETYPE(psrc) == PAGETYPE(pdst)); + if (unlikely(PAGETYPE(psrc) != PAGETYPE(pdst))) { + cdst->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; + } + /* Mark dst as dirty. */ if (unlikely(rc = mdbx_page_touch(cdst))) return rc; @@ -9362,7 +9376,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { m2 = m2->mc_next) { MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) continue; if (m3->mc_pg[0] == mp) { m3->mc_snum = 0; From f57ffc987c340bf68ab5bcfe5764961d78953d3d Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Sep 2018 23:08:11 +0300 Subject: [PATCH 35/83] mdbx: backport - drop inherited broken audit (will be fixed in the master branch). Internal self-audit (inherited from LMDB) is invalid and useless for sub-db and dupsort cases. --- src/bits.h | 3 - src/mdbx.c | 166 ----------------------------------------------------- 2 files changed, 169 deletions(-) diff --git a/src/bits.h b/src/bits.h index dd97adf5..a4789ec0 100644 --- a/src/bits.h +++ b/src/bits.h @@ -817,14 +817,11 @@ void mdbx_panic(const char *fmt, ...) #define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT) -#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT) - #define mdbx_debug_enabled(type) \ unlikely(mdbx_runtime_flags &(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) #else #define mdbx_debug_enabled(type) (0) -#define mdbx_audit_enabled() (0) #ifndef NDEBUG #define mdbx_assert_enabled() (1) #else diff --git a/src/mdbx.c b/src/mdbx.c index b968554f..4ac5d340 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1219,169 +1219,6 @@ char *mdbx_dkey(const MDBX_val *key, char *const buf, const size_t bufsize) { return buf; } -#if 0 /* LY: debug stuff */ -static const char *mdbx_leafnode_type(MDBX_node *n) { - static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" - : tp[F_ISSET(n->mn_flags, F_DUPDATA)] - [F_ISSET(n->mn_flags, F_SUBDATA)]; -} - -/* Display all the keys in the page. */ -static void mdbx_page_list(MDBX_page *mp) { - pgno_t pgno = mp->mp_pgno; - const char *type, *state = (mp->mp_flags & P_DIRTY) ? 
", dirty" : ""; - MDBX_node *node; - unsigned i, nkeys, nsize, total = 0; - MDBX_val key; - DKBUF; - - switch (mp->mp_flags & - (P_BRANCH | P_LEAF | P_LEAF2 | P_META | P_OVERFLOW | P_SUBP)) { - case P_BRANCH: - type = "Branch page"; - break; - case P_LEAF: - type = "Leaf page"; - break; - case P_LEAF | P_SUBP: - type = "Sub-page"; - break; - case P_LEAF | P_LEAF2: - type = "LEAF2 page"; - break; - case P_LEAF | P_LEAF2 | P_SUBP: - type = "LEAF2 sub-page"; - break; - case P_OVERFLOW: - mdbx_print("Overflow page %" PRIu64 " pages %u%s\n", pgno, mp->mp_pages, - state); - return; - case P_META: - mdbx_print("Meta-page %" PRIu64 " txnid %" PRIu64 "\n", pgno, - ((MDBX_meta *)PAGEDATA(mp))->mm_txnid); - return; - default: - mdbx_print("Bad page %" PRIu64 " flags 0x%X\n", pgno, mp->mp_flags); - return; - } - - nkeys = NUMKEYS(mp); - mdbx_print("%s %" PRIu64 " numkeys %u%s\n", type, pgno, nkeys, state); - - for (i = 0; i < nkeys; i++) { - if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ - key.iov_len = nsize = mp->mp_leaf2_ksize; - key.iov_base = LEAF2KEY(mp, i, nsize); - total += nsize; - mdbx_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); - continue; - } - node = NODEPTR(mp, i); - key.iov_len = node->mn_ksize; - key.iov_base = node->mn_data; - nsize = NODESIZE + key.iov_len; - if (IS_BRANCH(mp)) { - mdbx_print("key %u: page %" PRIu64 ", %s\n", i, NODEPGNO(node), - DKEY(&key)); - total += nsize; - } else { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - nsize += sizeof(pgno_t); - else - nsize += NODEDSZ(node); - total += nsize; - nsize += sizeof(indx_t); - mdbx_print("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), - mdbx_leafnode_type(node)); - } - total = EVEN(total); - } - mdbx_print("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, - SIZELEFT(mp)); -} - -static void mdbx_cursor_chk(MDBX_cursor *mc) { - unsigned i; - MDBX_node *node; - MDBX_page *mp; - - if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) - return; - for (i = 0; i < mc->mc_top; i++) { - mp = mc->mc_pg[i]; - node = NODEPTR(mp, mc->mc_ki[i]); - if (unlikely(NODEPGNO(node) != mc->mc_pg[i + 1]->mp_pgno)) - mdbx_print("oops!\n"); - } - if (unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) - mdbx_print("ack!\n"); - if (XCURSOR_INITED(mc)) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (((node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) && - mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { - mdbx_print("blah!\n"); - } - } -} -#endif /* 0 */ - -/* Count all the pages in each DB and in the freelist and make sure - * it matches the actual number of pages being used. - * All named DBs must be open for a correct count. */ -static int mdbx_audit(MDBX_txn *txn) { - MDBX_cursor mc; - MDBX_val key, data; - int rc; - - pgno_t freecount = 0; - rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - while ((rc = mdbx_cursor_get(&mc, &key, &data, MDBX_NEXT)) == 0) - freecount += *(pgno_t *)data.iov_base; - mdbx_tassert(txn, rc == MDBX_NOTFOUND); - - pgno_t count = 0; - for (MDBX_dbi i = 0; i < txn->mt_numdbs; i++) { - MDBX_xcursor mx; - if (!(txn->mt_dbflags[i] & DB_VALID)) - continue; - rc = mdbx_cursor_init(&mc, txn, i, &mx); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if (txn->mt_dbs[i].md_root == P_INVALID) - continue; - count += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; - if (txn->mt_dbs[i].md_flags & MDBX_DUPSORT) { - rc = mdbx_page_search(&mc, NULL, MDBX_PS_FIRST); - for (; rc == MDBX_SUCCESS; rc = mdbx_cursor_sibling(&mc, 1)) { - MDBX_page *mp = mc.mc_pg[mc.mc_top]; - for (unsigned j = 0; j < NUMKEYS(mp); j++) { - MDBX_node *leaf = NODEPTR(mp, j); - 
if (leaf->mn_flags & F_SUBDATA) { - MDBX_db db; - memcpy(&db, NODEDATA(leaf), sizeof(db)); - count += - db.md_branch_pages + db.md_leaf_pages + db.md_overflow_pages; - } - } - } - mdbx_tassert(txn, rc == MDBX_NOTFOUND); - } - } - if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - mdbx_print("audit: %" PRIaTXN " freecount: %" PRIaPGNO " count: %" PRIaPGNO - " total: %" PRIaPGNO " next_pgno: %" PRIaPGNO "\n", - txn->mt_txnid, freecount, count + NUM_METAS, - freecount + count + NUM_METAS, txn->mt_next_pgno); - return MDBX_CORRUPTED; - } - return MDBX_SUCCESS; -} - int mdbx_cmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); @@ -4297,9 +4134,6 @@ int mdbx_txn_commit(MDBX_txn *txn) { env->me_reclaimed_pglist = NULL; mdbx_pnl_shrink(&txn->mt_befree_pages); - if (mdbx_audit_enabled()) - mdbx_audit(txn); - rc = mdbx_page_flush(txn, 0); if (likely(rc == MDBX_SUCCESS)) { MDBX_meta meta, *head = mdbx_meta_head(env); From e32ca55258c18ad2f9384e196eb70dea45399a50 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Sep 2018 16:55:22 +0300 Subject: [PATCH 36/83] mdbx: backport - fix tracking around mdbx_cursor_del(). --- src/mdbx.c | 67 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 4ac5d340..9c3a73ec 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9347,6 +9347,7 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { MDBX_cursor *m2, *m3; MDBX_dbi dbi = mc->mc_dbi; + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; mdbx_node_del(mc, mc->mc_db->md_xsize); @@ -9355,9 +9356,9 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { /* Adjust other cursors pointing to mp */ for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; - if (m3 == mc || m3->mc_snum < mc->mc_snum) + if (m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] == ki) { @@ -9388,32 +9389,35 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { return rc; } + ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); nkeys = NUMKEYS(mp); mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && nkeys == 0)); - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2 = m2->mc_next) { + /* Adjust THIS and other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; if (m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(m3, 1); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - continue; - } else if (unlikely(rc != MDBX_SUCCESS)) - break; - } - if (mc->mc_db->md_flags & MDBX_DUPSORT) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(m3, true); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; + } else if (unlikely(rc != MDBX_SUCCESS)) + break; + } + if (m3->mc_ki[mc->mc_top] >= ki || m3->mc_pg[mc->mc_top] != mp) { + if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && + (m3->mc_flags & C_EOF) == 0) { MDBX_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); /* 
If this node has dupdata, it may need to be reinited @@ -9426,14 +9430,41 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); } else { rc = mdbx_xcursor_init1(m3, node); - if (likely(rc == MDBX_SUCCESS)) - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + if (unlikely(rc != MDBX_SUCCESS)) + break; + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; } } } } } } + + if (mc->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(mc, true); + if (rc == MDBX_NOTFOUND) { + mc->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + } + } + if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && + (mc->mc_flags & C_EOF) == 0) { + MDBX_node *node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. */ + if (node->mn_flags & F_DUPDATA) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else { + rc = mdbx_xcursor_init1(mc, node); + if (likely(rc != MDBX_SUCCESS)) + mc->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + } + } mc->mc_flags |= C_DEL; } From d232737087a80eecf1d0f3e4d346010137ff2bac Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 18 Sep 2018 18:16:24 +0300 Subject: [PATCH 37/83] mdbx: backport - add MDBX_FORCE_ASSERT. Change-Id: I68a9f7b42663ea157c7c0a5a58797c94127b45ed --- src/bits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bits.h b/src/bits.h index a4789ec0..008579a9 100644 --- a/src/bits.h +++ b/src/bits.h @@ -822,7 +822,7 @@ void mdbx_panic(const char *fmt, ...) 
#else #define mdbx_debug_enabled(type) (0) -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(MDBX_FORCE_ASSERT) #define mdbx_assert_enabled() (1) #else #define mdbx_assert_enabled() (0) From 3f10e58df2a1d58f83593e22dbf988a6f555e93f Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 18 Sep 2018 21:07:01 +0300 Subject: [PATCH 38/83] mdbx: backport - re-define assert macro via mdbx_assert. Change-Id: I317801ba4200bdf1aa5cacf75d21a8e633fbc48a --- src/bits.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bits.h b/src/bits.h index 008579a9..2aad9502 100644 --- a/src/bits.h +++ b/src/bits.h @@ -928,6 +928,9 @@ void mdbx_panic(const char *fmt, ...) /* assert(3) variant in transaction context */ #define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#undef assert +#define assert(expr) mdbx_assert(NULL, expr) + static __inline void mdbx_jitter4testing(bool tiny) { #ifndef NDEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) From 353b6b8af0164db14a9a6bdccd6a97e6ab5aeb78 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 18 Sep 2018 21:10:14 +0300 Subject: [PATCH 39/83] mdbx: backport - refine assertions (minor). 
Change-Id: Ic924988b8ce043d6106df381c996dd2c8ff9ca1f --- src/mdbx.c | 57 +++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 9c3a73ec..7ee6bd26 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1886,9 +1886,9 @@ static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, bailout: if (rc == MDBX_SUCCESS) { #if defined(_WIN32) || defined(_WIN64) - assert(size_bytes == env->me_dxb_mmap.current); - assert(size_bytes <= env->me_dxb_mmap.filesize); - assert(limit_bytes == env->me_dxb_mmap.length); + mdbx_assert(env, size_bytes == env->me_dxb_mmap.current); + mdbx_assert(env, size_bytes <= env->me_dxb_mmap.filesize); + mdbx_assert(env, limit_bytes == env->me_dxb_mmap.length); #endif env->me_dbgeo.now = size_bytes; env->me_dbgeo.upper = limit_bytes; @@ -1991,7 +1991,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (likely(flags & MDBX_ALLOC_CACHE)) { /* If there are any loose pages, just use them */ - assert(mp && num); + mdbx_assert(env, mp && num); if (likely(num == 1 && txn->mt_loose_pages)) { np = txn->mt_loose_pages; txn->mt_loose_pages = NEXT_LOOSE_PAGE(np); @@ -2323,7 +2323,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, *mp = NULL; txn->mt_flags |= MDBX_TXN_ERROR; } - assert(rc); + mdbx_assert(env, rc != MDBX_SUCCESS); return rc; } @@ -2593,7 +2593,7 @@ static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { env->me_sync_pending > pgno2bytes(env, 16 /* FIXME: define threshold */) && (flags & MDBX_NOSYNC) == 0) { - assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); mdbx_txn_unlock(env); @@ -2809,8 +2809,10 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { r->mr_pid = pid; mdbx_rdt_unlock(env); - if (likely(env->me_flags & MDBX_ENV_TXKEY)) + 
if (likely(env->me_flags & MDBX_ENV_TXKEY)) { + mdbx_assert(env, env->me_live_reader == env->me_pid); mdbx_thread_rthc_set(env->me_txkey, r); + } } while (1) { @@ -2927,7 +2929,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { return MDBX_SUCCESS; } bailout: - assert(rc != MDBX_SUCCESS); + mdbx_tassert(txn, rc != MDBX_SUCCESS); mdbx_txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); return rc; } @@ -3136,7 +3138,7 @@ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { if (ptr) { env->me_dbxs[i].md_name.iov_len = 0; mdbx_compiler_barrier(); - assert(env->me_dbflags[i] == 0); + mdbx_assert(env, env->me_dbflags[i] == 0); env->me_dbiseqs[i]++; env->me_dbxs[i].md_name.iov_base = NULL; free(ptr); @@ -3256,8 +3258,8 @@ int mdbx_txn_reset(MDBX_txn *txn) { /* LY: don't close DBI-handles in MDBX mode */ int rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); if (rc == MDBX_SUCCESS) { - assert(txn->mt_signature == MDBX_MT_SIGNATURE); - assert(txn->mt_owner == 0); + mdbx_tassert(txn, txn->mt_signature == MDBX_MT_SIGNATURE); + mdbx_tassert(txn, txn->mt_owner == 0); } return rc; } @@ -3811,6 +3813,7 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { continue; } pgno = dl[i].mid; + mdbx_tassert(txn, pgno >= MIN_PAGENO); /* clear dirty flag */ dp->mp_flags &= ~P_DIRTY; dp->mp_validator = 0 /* TODO */; @@ -4171,7 +4174,7 @@ fail: * before mapping it into memory. 
*/ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, uint64_t *filesize) { - assert(offsetof(MDBX_page, mp_meta) == PAGEHDRSZ); + STATIC_ASSERT(offsetof(MDBX_page, mp_meta) == PAGEHDRSZ); int rc = mdbx_filesize(env->me_fd, filesize); if (unlikely(rc != MDBX_SUCCESS)) @@ -5350,7 +5353,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { break; if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { - assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); + mdbx_assert(env, META_IS_STEADY(&meta) && !META_IS_STEADY(head)); if (env->me_flags & MDBX_RDONLY) { mdbx_error("rollback needed: (from head %" PRIaTXN " to steady %" PRIaTXN "), but unable in read-only mode", @@ -5484,8 +5487,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { /* Open and/or initialize the lock region for the environment. */ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, mode_t mode) { - assert(env->me_fd != INVALID_HANDLE_VALUE); - assert(env->me_lfd == INVALID_HANDLE_VALUE); + mdbx_assert(env, env->me_fd != INVALID_HANDLE_VALUE); + mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); int err = mdbx_openfile(lck_pathname, O_RDWR | O_CREAT, mode, &env->me_lfd); if (err != MDBX_SUCCESS) { @@ -7110,7 +7113,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; return MDBX_NOTFOUND; } - assert(nkeys > 0); + mdbx_cassert(mc, nkeys > 0); rc = MDBX_SUCCESS; if (IS_LEAF2(mp)) { @@ -7718,7 +7721,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDBX_SUCCESS) return rc2; } else if (data->iov_len == olddata.iov_len) { - assert(EVEN(key->iov_len) == EVEN(leaf->mn_ksize)); + mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(leaf->mn_ksize)); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. 
*/ @@ -7735,8 +7738,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, key->iov_len < UINT16_MAX); leaf->mn_ksize = (uint16_t)key->iov_len; memcpy(NODEKEY(leaf), key->iov_base, key->iov_len); - assert((char *)NODEKEY(leaf) + NODEDSZ(leaf) < - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + mdbx_cassert(mc, (char *)NODEKEY(leaf) + NODEDSZ(leaf) < + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; } return MDBX_SUCCESS; @@ -10689,14 +10692,16 @@ static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, if (!txn->mt_dbxs[dbi].md_cmp || MDBX_DEBUG) { if (!keycmp) keycmp = mdbx_default_keycmp(user_flags); - assert(!txn->mt_dbxs[dbi].md_cmp || txn->mt_dbxs[dbi].md_cmp == keycmp); + mdbx_tassert(txn, !txn->mt_dbxs[dbi].md_cmp || + txn->mt_dbxs[dbi].md_cmp == keycmp); txn->mt_dbxs[dbi].md_cmp = keycmp; } if (!txn->mt_dbxs[dbi].md_dcmp || MDBX_DEBUG) { if (!datacmp) datacmp = mdbx_default_datacmp(user_flags); - assert(!txn->mt_dbxs[dbi].md_dcmp || txn->mt_dbxs[dbi].md_dcmp == datacmp); + mdbx_tassert(txn, !txn->mt_dbxs[dbi].md_dcmp || + txn->mt_dbxs[dbi].md_dcmp == datacmp); txn->mt_dbxs[dbi].md_dcmp = datacmp; } @@ -10797,7 +10802,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, if (env->me_dbflags[i] & MDBX_VALID) { txn->mt_dbs[i].md_flags = env->me_dbflags[i] & PERSISTENT_FLAGS; txn->mt_dbflags[i] = DB_VALID | DB_USRVALID | DB_STALE; - assert(txn->mt_dbxs[i].md_cmp != NULL); + mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); } } txn->mt_numdbs = env->me_numdbs; @@ -10826,7 +10831,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, MDBX_db db_dummy; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ - assert(rc == MDBX_NOTFOUND); + mdbx_tassert(txn, rc == MDBX_NOTFOUND); memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_flags = user_flags & PERSISTENT_FLAGS; @@ 
-10849,7 +10854,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, env->me_dbflags[slot] = 0; rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) { - assert((dbflag & DB_CREAT) == 0); + mdbx_tassert(txn, (dbflag & DB_CREAT) == 0); bailout: free(namedup); } else { @@ -11260,7 +11265,7 @@ int __cold mdbx_reader_check(MDBX_env *env, int *dead) { * MDBX_SUCCESS - done * Otherwise errcode. */ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { - assert(rdt_locked >= 0); + mdbx_assert(env, rdt_locked >= 0); if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; @@ -12119,7 +12124,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(new < increment)) return MDBX_RESULT_TRUE; - assert(new > dbs->md_seq); + mdbx_tassert(txn, new > dbs->md_seq); dbs->md_seq = new; txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbflags[dbi] |= DB_DIRTY; From f3e9731da42e2db63396f3854e908464a549bdc2 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Wed, 19 Sep 2018 00:21:42 +0300 Subject: [PATCH 40/83] mdbx: backport - move macros/inlines to fix Windows builds. Change-Id: I48aaf6b77466bb8b13294b84de73fb6063c88190 --- src/bits.h | 119 +++++-------------------------------------------- src/mdbx.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 109 deletions(-) diff --git a/src/bits.h b/src/bits.h index 2aad9502..d6de6059 100644 --- a/src/bits.h +++ b/src/bits.h @@ -928,20 +928,8 @@ void mdbx_panic(const char *fmt, ...) 
/* assert(3) variant in transaction context */ #define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) -#undef assert -#define assert(expr) mdbx_assert(NULL, expr) - -static __inline void mdbx_jitter4testing(bool tiny) { -#ifndef NDEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); -#else - (void)tiny; -#endif -} - /*----------------------------------------------------------------------------*/ -/* Internal prototypes and inlines */ +/* Internal prototypes */ int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, @@ -952,24 +940,6 @@ void mdbx_rthc_global_init(void); void mdbx_rthc_global_dtor(void); void mdbx_rthc_thread_dtor(void *ptr); -static __inline bool mdbx_is_power2(size_t x) { return (x & (x - 1)) == 0; } - -static __inline size_t mdbx_roundup2(size_t value, size_t granularity) { - assert(mdbx_is_power2(granularity)); - return (value + granularity - 1) & ~(granularity - 1); -} - -static __inline unsigned mdbx_log2(size_t value) { - assert(mdbx_is_power2(value)); - - unsigned log = 0; - while (value > 1) { - log += 1; - value >>= 1; - } - return log; -} - #define MDBX_IS_ERROR(rc) \ ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) @@ -1115,71 +1085,12 @@ typedef struct MDBX_node { * This is node header plus key plus data size. 
*/ #define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) -/* Address of node i in page p */ -static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { - assert(NUMKEYS(p) > (unsigned)(i)); - return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ); -} - /* Address of the key for the node */ #define NODEKEY(node) (void *)((node)->mn_data) /* Address of the data for a node */ #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) -/* Get the page number pointed to by a branch node */ -static __inline pgno_t NODEPGNO(const MDBX_node *node) { - pgno_t pgno; - if (UNALIGNED_OK) { - pgno = node->mn_ksize_and_pgno; - if (sizeof(pgno_t) > 4) - pgno &= MAX_PAGENO; - } else { - pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_flags) << 32; - } - return pgno; -} - -/* Set the page number in a branch node */ -static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { - assert(pgno <= MAX_PAGENO); - - if (UNALIGNED_OK) { - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_ksize) << 48; - node->mn_ksize_and_pgno = pgno; - } else { - node->mn_lo = (uint16_t)pgno; - node->mn_hi = (uint16_t)(pgno >> 16); - if (sizeof(pgno_t) > 4) - node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); - } -} - -/* Get the size of the data in a leaf node */ -static __inline size_t NODEDSZ(const MDBX_node *node) { - size_t size; - if (UNALIGNED_OK) { - size = node->mn_dsize; - } else { - size = node->mn_lo | ((size_t)node->mn_hi << 16); - } - return size; -} - -/* Set the size of the data for a leaf node */ -static __inline void SETDSZ(MDBX_node *node, size_t size) { - assert(size < INT_MAX); - if (UNALIGNED_OK) { - node->mn_dsize = (uint32_t)size; - } else { - node->mn_lo = (uint16_t)size; - node->mn_hi = (uint16_t)(size >> 16); - } -} - /* The size of a key in a node */ #define NODEKSZ(node) ((node)->mn_ksize) @@ -1232,19 +1143,8 @@ static __inline void SETDSZ(MDBX_node *node, size_t size) { 
#define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) #endif -static __inline size_t pgno2bytes(const MDBX_env *env, pgno_t pgno) { - mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); - return ((size_t)pgno) << env->me_psize2log; -} - -static __inline MDBX_page *pgno2page(const MDBX_env *env, pgno_t pgno) { - return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); -} - -static __inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { - mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); - return (pgno_t)(bytes >> env->me_psize2log); -} +/* Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDBX_NOSPILL 0x8000 static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); @@ -1256,10 +1156,11 @@ static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO; } -static __inline size_t pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) { - return mdbx_roundup2(pgno2bytes(env, pgno), env->me_os_psize); -} - -static __inline pgno_t pgno_align2os_pgno(const MDBX_env *env, pgno_t pgno) { - return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); +static __inline void mdbx_jitter4testing(bool tiny) { +#ifndef NDEBUG + if (MDBX_DBG_JITTER & mdbx_runtime_flags) + mdbx_osal_jitter(tiny); +#else + (void)tiny; +#endif } diff --git a/src/mdbx.c b/src/mdbx.c index 7ee6bd26..25d9be1a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -37,7 +37,134 @@ #include "./bits.h" +/*----------------------------------------------------------------------------*/ +/* Internal inlines */ + +#undef assert +#define assert(expr) mdbx_assert(NULL, expr) + +static __inline bool mdbx_is_power2(size_t x) { return (x & (x - 1)) == 0; } + +static __inline size_t mdbx_roundup2(size_t value, size_t granularity) { + assert(mdbx_is_power2(granularity)); + return (value + granularity - 1) & ~(granularity - 1); +} + +static __inline unsigned 
mdbx_log2(size_t value) { + assert(mdbx_is_power2(value)); + + unsigned log = 0; + while (value > 1) { + log += 1; + value >>= 1; + } + return log; +} + +/* Address of node i in page p */ +static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { + assert(NUMKEYS(p) > (unsigned)(i)); + return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ); +} + +/* Get the page number pointed to by a branch node */ +static __inline pgno_t NODEPGNO(const MDBX_node *node) { + pgno_t pgno; + if (UNALIGNED_OK) { + pgno = node->mn_ksize_and_pgno; + if (sizeof(pgno_t) > 4) + pgno &= MAX_PAGENO; + } else { + pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_flags) << 32; + } + return pgno; +} + +/* Set the page number in a branch node */ +static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { + assert(pgno <= MAX_PAGENO); + + if (UNALIGNED_OK) { + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_ksize) << 48; + node->mn_ksize_and_pgno = pgno; + } else { + node->mn_lo = (uint16_t)pgno; + node->mn_hi = (uint16_t)(pgno >> 16); + if (sizeof(pgno_t) > 4) + node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); + } +} + +/* Get the size of the data in a leaf node */ +static __inline size_t NODEDSZ(const MDBX_node *node) { + size_t size; + if (UNALIGNED_OK) { + size = node->mn_dsize; + } else { + size = node->mn_lo | ((size_t)node->mn_hi << 16); + } + return size; +} + +/* Set the size of the data for a leaf node */ +static __inline void SETDSZ(MDBX_node *node, size_t size) { + assert(size < INT_MAX); + if (UNALIGNED_OK) { + node->mn_dsize = (uint32_t)size; + } else { + node->mn_lo = (uint16_t)size; + node->mn_hi = (uint16_t)(size >> 16); + } +} + +static __inline size_t pgno2bytes(const MDBX_env *env, pgno_t pgno) { + mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); + return ((size_t)pgno) << env->me_psize2log; +} + +static __inline MDBX_page *pgno2page(const MDBX_env *env, pgno_t pgno) { + return 
(MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); +} + +static __inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { + mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); + return (pgno_t)(bytes >> env->me_psize2log); +} + +static __inline size_t pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) { + return mdbx_roundup2(pgno2bytes(env, pgno), env->me_os_psize); +} + +static __inline pgno_t pgno_align2os_pgno(const MDBX_env *env, pgno_t pgno) { + return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); +} + +/* Perform act while tracking temporary cursor mn */ +#define WITH_CURSOR_TRACKING(mn, act) \ + do { \ + mdbx_cassert(&(mn), \ + mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ + MDBX_cursor mc_dummy, *tracked, \ + **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + if ((mn).mc_flags & C_SUB) { \ + mc_dummy.mc_flags = C_INITIALIZED; \ + mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ + tracked = &mc_dummy; \ + } else { \ + tracked = &(mn); \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ + } while (0) + +/*----------------------------------------------------------------------------*/ /* LY: temporary workaround for Elbrus's memcmp() bug. */ + #if defined(__e2k__) && !__GLIBC_PREREQ(2, 24) int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, size_t n) { From 6150a8c9035555ad6b906ce9631e22edc3ae1d35 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Sep 2018 12:36:49 +0300 Subject: [PATCH 41/83] mdbx: backport - fix/refine mdbx_update_gc() (squashed). 
--- src/mdbx.c | 352 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 234 insertions(+), 118 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 25d9be1a..64e28a58 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3465,6 +3465,9 @@ static int mdbx_update_gc(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) return rc; + mc.mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = &mc; + const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); (void)dbg_prefix_mode; @@ -3480,17 +3483,51 @@ retry: goto bailout; } - unsigned placed = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, + unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, filled_gc_slot = ~0u; - txnid_t cleaned_gc_id = 0, - head_gc_id = lifo ? *env->me_oldest : env->me_last_reclaimed; + txnid_t cleaned_gc_id = 0, head_gc_id = env->me_last_reclaimed; while (1) { /* Come back here after each Put() in case befree-list changed */ MDBX_val key, data; + mdbx_trace(" >> continue"); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); - if (!lifo) { + if (txn->mt_lifo_reclaimed) { + if (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]) { + settled = 0; + reused_gc_slot = 0; + filled_gc_slot = ~0u; + cleaned_gc_slot = 0; + head_gc_id = ~(txnid_t)0; + /* LY: cleanup reclaimed records. */ + do { + cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot]; + mdbx_tassert(txn, + cleaned_gc_slot > 0 && cleaned_gc_id < *env->me_oldest); + head_gc_id = + (head_gc_id > cleaned_gc_id) ? 
cleaned_gc_id : head_gc_id; + key.iov_base = &cleaned_gc_id; + key.iov_len = sizeof(cleaned_gc_id); + rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET); + if (rc == MDBX_NOTFOUND) + continue; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = mdbx_prep_backlog(txn, &mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); + mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, + cleaned_gc_slot, cleaned_gc_id); + mc.mc_flags |= C_RECLAIMING; + rc = mdbx_cursor_del(&mc, 0); + mc.mc_flags ^= C_RECLAIMING; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]); + } + } else { /* If using records from freeDB which we have not yet deleted, * now delete them and any we reserved for me_reclaimed_pglist. */ while (cleaned_gc_id < env->me_last_reclaimed) { @@ -3501,40 +3538,16 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; cleaned_gc_id = head_gc_id = *(txnid_t *)key.iov_base; - mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); - placed = 0; mdbx_tassert(txn, cleaned_gc_id <= env->me_last_reclaimed); - mc.mc_flags |= C_RECLAIMING; + mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, cleaned_gc_id); + mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_del(&mc, 0); mc.mc_flags ^= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } - } else if (txn->mt_lifo_reclaimed) { - /* LY: cleanup reclaimed records. */ - while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]) { - cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot]; - head_gc_id = (head_gc_id > cleaned_gc_id) ? 
cleaned_gc_id : head_gc_id; - key.iov_base = &cleaned_gc_id; - key.iov_len = sizeof(cleaned_gc_id); - rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET); - if (likely(rc != MDBX_NOTFOUND)) { - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = mdbx_prep_backlog(txn, &mc); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); - mc.mc_flags |= C_RECLAIMING; - mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, - cleaned_gc_slot, cleaned_gc_id); - rc = mdbx_cursor_del(&mc, 0); - mc.mc_flags ^= C_RECLAIMING; - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } + settled = 0; } } @@ -3666,81 +3679,162 @@ retry: // handle reclaimed and loost pages - merge and store both into gc mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, txn->mt_loose_count == 0); + + mdbx_trace(" >> reserving"); const unsigned amount = env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0; - const unsigned left = amount - placed; - - mdbx_trace("%s: amount %u, placed %d, left %d", dbg_prefix_mode, amount, - placed, (int)left); + const unsigned left = amount - settled; + mdbx_trace("%s: amount %u, placed %d, left %d, lifo-reclaimed-slots %u, " + "reused-gc-slots %u", + dbg_prefix_mode, amount, settled, (int)left, + txn->mt_lifo_reclaimed ? 
(unsigned)txn->mt_lifo_reclaimed[0] : 0, + reused_gc_slot); if (0 >= (int)left) break; - mdbx_trace(" >> reserving"); + if (unlikely(head_gc_id == 0)) { + head_gc_id = mdbx_find_oldest(txn) - 1; + if (txn->mt_lifo_reclaimed == NULL) { + rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_FIRST); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND) + goto bailout; + } else if (unlikely(key.iov_len != sizeof(txnid_t))) { + rc = MDBX_CORRUPTED; + goto bailout; + } else { + txnid_t first_txn; + memcpy(&first_txn, key.iov_base, sizeof(txnid_t)); + if (head_gc_id >= first_txn) + head_gc_id = first_txn - 1; + } + } + } + + const unsigned prefer_max_scatter = 257; txnid_t reservation_gc_id; - const unsigned lifo_gc_slots = - txn->mt_lifo_reclaimed ? (unsigned)txn->mt_lifo_reclaimed[0] : 0; if (lifo) { - if (reused_gc_slot >= lifo_gc_slots) { + mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL); + if (unlikely(!txn->mt_lifo_reclaimed)) { + txn->mt_lifo_reclaimed = mdbx_txl_alloc(); + if (unlikely(!txn->mt_lifo_reclaimed)) { + rc = MDBX_ENOMEM; + goto bailout; + } + } + + if (head_gc_id > 1 && txn->mt_lifo_reclaimed[0] < prefer_max_scatter && + left > ((unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slot) * + env->me_maxgc_ov1page) { /* LY: need just a txn-id for save page list. */ rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); - if (likely(rc == MDBX_SUCCESS)) + if (likely(rc == MDBX_SUCCESS)) { /* LY: ok, reclaimed from freedb. */ + mdbx_trace("%s: took @%" PRIaTXN " from GC, continue", + dbg_prefix_mode, MDBX_PNL_LAST(txn->mt_lifo_reclaimed)); continue; + } if (unlikely(rc != MDBX_NOTFOUND)) /* LY: other troubles... */ goto bailout; - if (unlikely(!txn->mt_lifo_reclaimed)) { - txn->mt_lifo_reclaimed = mdbx_txl_alloc(); - if (unlikely(!txn->mt_lifo_reclaimed)) { - rc = MDBX_ENOMEM; - goto bailout; - } - } /* LY: freedb is empty, will look any free txn-id in high2low order. 
*/ - rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, --head_gc_id); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - cleaned_gc_slot += 1 /* mark GC cleanup is not needed. */; + do { + --head_gc_id; + mdbx_assert(env, + txn->mt_lifo_reclaimed[0] == 0 || + txn->mt_lifo_reclaimed[txn->mt_lifo_reclaimed[0]] > + head_gc_id); + rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, head_gc_id); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + cleaned_gc_slot += 1 /* mark GC cleanup is not needed. */; - mdbx_trace("%s: append @%" PRIaTXN - " to lifo-reclaimed, cleaned-gc-slot = %u", - dbg_prefix_mode, head_gc_id, cleaned_gc_slot); + mdbx_trace("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %u", + dbg_prefix_mode, head_gc_id, cleaned_gc_slot); + } while (head_gc_id > 1 && + txn->mt_lifo_reclaimed[0] < prefer_max_scatter && + left > ((unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slot) * + env->me_maxgc_ov1page); } - mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL); - reservation_gc_id = txn->mt_lifo_reclaimed[++reused_gc_slot]; + + if ((unsigned)txn->mt_lifo_reclaimed[0] <= reused_gc_slot) { + mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " + "lifo_reclaimed %u" PRIaTXN, + reused_gc_slot, (unsigned)txn->mt_lifo_reclaimed[0]); + goto retry; + } + const unsigned i = (unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slot; + mdbx_tassert(txn, i > 0 && i <= txn->mt_lifo_reclaimed[0]); + reservation_gc_id = txn->mt_lifo_reclaimed[i]; mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", - dbg_prefix_mode, reservation_gc_id, reused_gc_slot); - head_gc_id = - (head_gc_id > reservation_gc_id) ? 
reservation_gc_id : head_gc_id; + dbg_prefix_mode, reservation_gc_id, i); } else { mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL); - reused_gc_slot++ /* just count reserved records */; reservation_gc_id = head_gc_id--; mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, reservation_gc_id); } + ++reused_gc_slot; - mdbx_trace("%s: head_gc_id %" PRIaTXN - ", reused_gc_slot %u, lifo_gc_slots %u, reservation-id " + unsigned chunk = left; + if (unlikely(chunk > env->me_maxgc_ov1page)) { + const unsigned avail_gc_slots = + txn->mt_lifo_reclaimed + ? (unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slot + 1 + : (head_gc_id < INT16_MAX) ? (unsigned)head_gc_id : INT16_MAX; + if (avail_gc_slots > 1) { + if (chunk < env->me_maxgc_ov1page * 2) + chunk /= 2; + else { + const unsigned threshold = + env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter) + ? avail_gc_slots + : prefer_max_scatter); + if (left < threshold) + chunk = env->me_maxgc_ov1page; + else { + const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1; + unsigned span = 1; + unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / + sizeof(pgno_t)) /*- 1 + span */; + if (tail > avail) { + for (unsigned i = amount - span; i > 0; --i) { + if (MDBX_PNL_ASCENDING + ? (env->me_reclaimed_pglist[i] + span) + : (env->me_reclaimed_pglist[i] - span) == + env->me_reclaimed_pglist[i + span]) { + span += 1; + avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / + sizeof(pgno_t)) - + 1 + span; + if (avail >= tail) + break; + } + } + } + + chunk = (avail >= tail) ? tail - span + : (avail_gc_slots > 3 && + reused_gc_slot < prefer_max_scatter - 3) + ? 
avail - span + : tail; + } + } + } + } + mdbx_tassert(txn, chunk > 0); + + mdbx_trace("%s: head_gc_id %" PRIaTXN ", reused_gc_slot %u, reservation-id " "%" PRIaTXN, - dbg_prefix_mode, head_gc_id, reused_gc_slot, lifo_gc_slots, - reservation_gc_id); + dbg_prefix_mode, head_gc_id, reused_gc_slot, reservation_gc_id); - const bool no_slots_more = - head_gc_id < 2 && (!lifo || reused_gc_slot >= lifo_gc_slots); - const unsigned chunk = - (left < env->me_maxgc_ov1page || no_slots_more) - ? left - : (left < env->me_maxgc_ov1page * 2) - ? /* the half to each of the last two chunks */ left / 2 - : env->me_maxgc_ov1page; - - mdbx_trace("%s: chunk %u, no_slots_more %s, gc-per-ovpage %u", - dbg_prefix_mode, chunk, no_slots_more ? "yes" : "no", + mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, env->me_maxgc_ov1page); - mdbx_tassert(txn, reservation_gc_id < *env->me_oldest); + mdbx_tassert(txn, + reservation_gc_id > 0 && reservation_gc_id < *env->me_oldest); if (unlikely(reservation_gc_id < 1)) { /* LY: not any txn in the past of freedb. 
*/ rc = MDBX_PROBLEM; @@ -3751,7 +3845,7 @@ retry: key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, - placed + 1, placed + chunk + 1, reservation_gc_id); + settled + 1, settled + chunk + 1, reservation_gc_id); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (unlikely(rc != MDBX_SUCCESS)) @@ -3759,8 +3853,8 @@ retry: /* PNL is initially empty, zero out at least the length */ memset(data.iov_base, 0, sizeof(pgno_t)); - placed += chunk; - mdbx_trace("%s.placed %u (+%u), continue", dbg_prefix_mode, placed, chunk); + settled += chunk; + mdbx_trace("%s.placed %u (+%u), continue", dbg_prefix_mode, settled, chunk); continue; } @@ -3770,7 +3864,9 @@ retry: mdbx_trace(" >> filling"); /* Fill in the reserved records */ - filled_gc_slot = reused_gc_slot; + filled_gc_slot = txn->mt_lifo_reclaimed + ? (unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slot + : reused_gc_slot; rc = MDBX_SUCCESS; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (env->me_reclaimed_pglist && env->me_reclaimed_pglist[0]) { @@ -3778,8 +3874,8 @@ retry: key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - unsigned left = env->me_reclaimed_pglist[0]; - pgno_t *end = env->me_reclaimed_pglist + left; + const unsigned amount = env->me_reclaimed_pglist[0]; + unsigned left = amount; if (txn->mt_lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); rc = mdbx_cursor_first(&mc, &key, &data); @@ -3796,8 +3892,7 @@ retry: if (txn->mt_lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); fill_gc_id = *(txnid_t *)key.iov_base; - if (filled_gc_slot-- /* just countdown reserved records */ == 0 || - fill_gc_id > env->me_last_reclaimed) { + if (filled_gc_slot-- == 0 || fill_gc_id > env->me_last_reclaimed) { mdbx_notice( "** restart: reserve depleted (filled_slot 
%u, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN, @@ -3806,18 +3901,18 @@ retry: } } else { mdbx_tassert(txn, lifo != 0); - if (filled_gc_slot == 0) { - mdbx_notice("** restart: reserve depleted (filled_slot == 0)"); + if (++filled_gc_slot > (unsigned)txn->mt_lifo_reclaimed[0]) { + mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " + "lifo_reclaimed %u" PRIaTXN, + filled_gc_slot, (unsigned)txn->mt_lifo_reclaimed[0]); goto retry; } - mdbx_tassert(txn, filled_gc_slot > 0 && - filled_gc_slot <= txn->mt_lifo_reclaimed[0]); - fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot--]; - mdbx_trace("%s.seek-reservaton @%" PRIaTXN " at lifo_reclaimed[%u]", - dbg_prefix_mode, fill_gc_id, (unsigned)filled_gc_slot); + fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot]; + mdbx_trace("%s.seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", + dbg_prefix_mode, fill_gc_id, filled_gc_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET); + rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET_KEY); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -3825,36 +3920,52 @@ retry: txn, cleaned_gc_slot == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); + mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < *env->me_oldest); const size_t space = (data.iov_len / sizeof(pgno_t)) - 1; const unsigned chunk = (space > left) ? 
left : (unsigned)space; - data.iov_len = (chunk + 1) * sizeof(pgno_t); + mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2 && chunk > 0); + if ((space > chunk && loop < 3) || + (space > chunk + env->me_maxgc_ov1page && loop < 21)) + data.iov_len = (chunk + 1) * sizeof(pgno_t); + data.iov_base = NULL; mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < *env->me_oldest); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - end -= chunk; - data.iov_base = end; - pgno_t save = end[0]; - end[0] = (pgno_t)chunk; - mdbx_tassert(txn, mdbx_pnl_check(end)); mc.mc_flags |= C_RECLAIMING; - rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT); + rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT | MDBX_RESERVE); mc.mc_flags ^= C_RECLAIMING; - mdbx_tassert(txn, mdbx_pnl_check(end)); - mdbx_tassert( - txn, cleaned_gc_slot == - (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - pgno_t *from = end + 1, *to = end + end[0]; - mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO - "] @%" PRIaTXN, - dbg_prefix_mode, (unsigned)end[0], - (unsigned)(from - env->me_reclaimed_pglist), *from, - (unsigned)(to - env->me_reclaimed_pglist), *to, fill_gc_id); - end[0] = save; + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + if (unlikely(txn->mt_loose_count || + amount != (unsigned)env->me_reclaimed_pglist[0])) { + memset(data.iov_base, 0, sizeof(pgno_t)); + mdbx_notice("** restart: reclaimed-list changed (%u -> %u, %u)", amount, + (unsigned)env->me_reclaimed_pglist[0], txn->mt_loose_count); + goto retry; + } + if (unlikely(txn->mt_lifo_reclaimed + ? 
cleaned_gc_slot < (unsigned)txn->mt_lifo_reclaimed[0] + : cleaned_gc_id < env->me_last_reclaimed)) { + memset(data.iov_base, 0, sizeof(pgno_t)); + mdbx_notice("** restart: reclaimed-slots changed"); + goto retry; + } + + pgno_t *dst = data.iov_base; + *dst = chunk; + pgno_t *src = env->me_reclaimed_pglist + left - chunk + 1; + memcpy(dst + 1, src, chunk * sizeof(pgno_t)); + mdbx_tassert(txn, mdbx_pnl_check(dst)); + pgno_t *from = src, *to = src + chunk; + mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO + "] @%" PRIaTXN, + dbg_prefix_mode, chunk, + (unsigned)(from - env->me_reclaimed_pglist), from[0], + (unsigned)(to - env->me_reclaimed_pglist), to[-1], fill_gc_id); + left -= chunk; if (left == 0) { rc = MDBX_SUCCESS; @@ -3870,16 +3981,21 @@ retry: } mdbx_tassert(txn, rc == MDBX_SUCCESS); - if (txn->mt_lifo_reclaimed) { - mdbx_tassert(txn, cleaned_gc_slot == txn->mt_lifo_reclaimed[0]); - if (unlikely(filled_gc_slot != 0)) { - mdbx_notice("** restart: reserve excess (filled-slot %u > 0)", - filled_gc_slot); - goto retry; - } + if (unlikely(txn->mt_loose_count != 0 || + filled_gc_slot != (txn->mt_lifo_reclaimed + ? (unsigned)txn->mt_lifo_reclaimed[0] + : 0))) { + mdbx_notice("** restart: reserve excess (filled-slot %u, loose-count %u)", + filled_gc_slot, txn->mt_loose_count); + goto retry; } + mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL || + cleaned_gc_slot == txn->mt_lifo_reclaimed[0]); + bailout: + txn->mt_cursors[FREE_DBI] = mc.mc_next; + if (txn->mt_lifo_reclaimed) { txn->mt_lifo_reclaimed[0] = 0; if (txn != env->me_txn0) { From 6da477d37f8646dbd33d2c817a5f6d4799722e96 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Sep 2018 12:37:23 +0300 Subject: [PATCH 42/83] mdbx-ci: backport - refines for Windows (squashed). - push logs to appveyor separately. - rename 'test.exe' to 'mdbx_test.exe'. - add test.db to appveyor artefacts (windows). 
--- appveyor.yml | 18 ++++++++++-------- test/test.vcxproj | 4 ++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 98ac34bd..3b0660ae 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,21 +32,23 @@ build_script: test_script: - ps: | - if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" -PathType Leaf)) { - $test = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" + if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\mdbx_test.exe" -PathType Leaf)) { + $mdbx_test = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\mdbx_test.exe" $mdbx_chk = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\mdbx_chk.exe" } elseif (($env:PLATFORM -ne "ARM") -and ($env:PLATFORM -ne "ARM64")) { - $test = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" + $mdbx_test = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\mdbx_test.exe" $mdbx_chk = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\mdbx_chk.exe" } else { - $test = "" + $mdbx_test = "" $mdbx_chk = "" } - if ($test -ne "") { - & "$test" --pathname=tmp.db --dont-cleanup-after basic | Tee-Object -file test.log | Select-Object -last 42 - & "$mdbx_chk" -nvv tmp.db | Tee-Object -file chk.log | Select-Object -last 42 + if ($mdbx_test -ne "") { + & "$mdbx_test" --pathname=test.db --dont-cleanup-after basic | Tee-Object -file test.log | Select-Object -last 42 + & "$mdbx_chk" -nvv test.db | Tee-Object -file chk.log | Select-Object -last 42 } on_failure: -- ps: Push-AppveyorArtifact test.log chk.log +- ps: Push-AppveyorArtifact test.log +- ps: Push-AppveyorArtifact test.db +- ps: Push-AppveyorArtifact chk.log diff --git a/test/test.vcxproj b/test/test.vcxproj index 3ee13cf8..a8c21d38 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -78,21 +78,25 @@ true $(SolutionDir)$(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + mdbx_test 
false $(SolutionDir)$(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + mdbx_test true $(SolutionDir)$(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + mdbx_test false $(SolutionDir)$(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + mdbx_test From b51d92d449bac5ba0128dc5cb6e1ab5867ed838a Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Sep 2018 12:37:28 +0300 Subject: [PATCH 43/83] mdbx-test: backport - update test (squashed). - add support for 'default' options values. - add min/max cases for option values. - add support for db-geometry params. - fix int-types for 32-bit builds (minor). - fix key/value generation for long-length cases. - fix update_flags for non-MDBX_DUPSORT. - 'none' for config-verbs. - check commandline length under Windows. - workaround for QueryFullProcessImageNameA() bug. - add setloglevel(). - workaround for MSVC bug. - avoid extra 'jitter' testcase loops. - cleanup DUPSORT flags. - refine key/value min/max handling. - dump keygen params. - fix/refine keygen. - alter keygen defaults (rotate 3, offset 41). - default test-db size 4mb or 256mb. - fix/refine keygen for non-MDBX_DUPSORT. - seeding keygen with actor_id for better spreading. 
--- test/config.cc | 115 ++++++++++++++++++++++++++++++++++++++----- test/config.h | 35 +++++++++++-- test/hill.cc | 8 +-- test/jitter.cc | 6 ++- test/keygen.cc | 79 +++++++++++++++++++---------- test/keygen.h | 13 +++-- test/log.cc | 42 ++++++++++++++-- test/log.h | 3 +- test/main.cc | 105 +++++++++++++++++++++++++++++---------- test/osal-unix.cc | 14 +++++- test/osal-windows.cc | 15 ++++-- test/test.cc | 40 ++------------- 12 files changed, 354 insertions(+), 121 deletions(-) diff --git a/test/config.cc b/test/config.cc index ae9367e6..7fa46208 100644 --- a/test/config.cc +++ b/test/config.cc @@ -43,6 +43,11 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, if (narg + 1 < argc && strncmp("--", argv[narg + 1], 2) != 0) { *value = argv[narg + 1]; + if (strcmp(*value, "default") == 0) { + if (!default_value) + failure("Option '--%s' doen't accept default value\n", option); + *value = default_value; + } ++narg; return true; } @@ -57,9 +62,15 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool parse_option(int argc, char *const argv[], int &narg, const char *option, std::string &value, bool allow_empty) { + return parse_option(argc, argv, narg, option, value, allow_empty, + allow_empty ? "" : nullptr); +} + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + std::string &value, bool allow_empty, + const char *default_value) { const char *value_cstr; - if (!parse_option(argc, argv, narg, option, &value_cstr, - allow_empty ? 
"" : nullptr)) + if (!parse_option(argc, argv, narg, option, &value_cstr, default_value)) return false; if (!allow_empty && strlen(value_cstr) == 0) @@ -110,12 +121,28 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool parse_option(int argc, char *const argv[], int &narg, const char *option, uint64_t &value, const scale_mode scale, - const uint64_t minval, const uint64_t maxval) { + const uint64_t minval, const uint64_t maxval, + const uint64_t default_value) { const char *value_cstr; if (!parse_option(argc, argv, narg, option, &value_cstr)) return false; + if (default_value && strcmp(value_cstr, "default") == 0) { + value = default_value; + return true; + } + + if (strcmp(value_cstr, "min") == 0 || strcmp(value_cstr, "minimal") == 0) { + value = minval; + return true; + } + + if (strcmp(value_cstr, "max") == 0 || strcmp(value_cstr, "maximal") == 0) { + value = maxval; + return true; + } + char *suffix = nullptr; errno = 0; unsigned long long raw = strtoull(value_cstr, &suffix, 0); @@ -179,28 +206,58 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool parse_option(int argc, char *const argv[], int &narg, const char *option, unsigned &value, const scale_mode scale, - const unsigned minval, const unsigned maxval) { + const unsigned minval, const unsigned maxval, + const unsigned default_value) { uint64_t huge; - if (!parse_option(argc, argv, narg, option, huge, scale, minval, maxval)) + if (!parse_option(argc, argv, narg, option, huge, scale, minval, maxval, + default_value)) return false; value = (unsigned)huge; return true; } bool parse_option(int argc, char *const argv[], int &narg, const char *option, - uint8_t &value, const uint8_t minval, const uint8_t maxval) { + uint8_t &value, const uint8_t minval, const uint8_t maxval, + const uint8_t default_value) { uint64_t huge; - if (!parse_option(argc, argv, narg, option, huge, no_scale, minval, maxval)) + if (!parse_option(argc, argv, narg, 
option, huge, no_scale, minval, maxval, + default_value)) return false; value = (uint8_t)huge; return true; } +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + int64_t &value, const int64_t minval, const int64_t maxval, + const int64_t default_value) { + uint64_t proxy = (uint64_t)value; + if (parse_option(argc, argv, narg, option, proxy, config::binary, + (uint64_t)minval, (uint64_t)maxval, + (uint64_t)default_value)) { + value = (int64_t)proxy; + return true; + } + return false; +} + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + int32_t &value, const int32_t minval, const int32_t maxval, + const int32_t default_value) { + uint64_t proxy = (uint64_t)value; + if (parse_option(argc, argv, narg, option, proxy, config::binary, + (uint64_t)minval, (uint64_t)maxval, + (uint64_t)default_value)) { + value = (int32_t)proxy; + return true; + } + return false; +} + bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool &value) { - const char *value_cstr = NULL; + const char *value_cstr = nullptr; if (!parse_option(argc, argv, narg, option, &value_cstr, "yes")) { const char *current = argv[narg]; if (strncmp(current, "--no-", 5) == 0 && strcmp(current + 5, option) == 0) { @@ -269,7 +326,7 @@ static void dump_verbs(const char *caption, size_t bits, ++verbs; } - logging::feed("\n"); + logging::feed("%s\n", (*comma == '\0') ? 
"none" : ""); } static void dump_duration(const char *caption, unsigned duration) { @@ -300,8 +357,12 @@ void dump(const char *title) { : i->params.pathname_log.c_str()); } - log_info("database: %s, size %" PRIu64 "\n", i->params.pathname_db.c_str(), - i->params.size); + log_info("database: %s, size %" PRIuPTR "[%" PRIiPTR "..%" PRIiPTR + ", %i %i, %i]\n", + i->params.pathname_db.c_str(), i->params.size_now, + i->params.size_lower, i->params.size_upper, + i->params.shrink_threshold, i->params.growth_step, + i->params.pagesize); dump_verbs("mode", i->params.mode_flags, mode_bits); dump_verbs("table", i->params.table_flags, table_bits); @@ -318,7 +379,13 @@ void dump(const char *title) { log_info("threads %u\n", i->params.nthreads); - log_info("keygen.case: %s\n", keygencase2str(i->params.keygen.keycase)); + log_info( + "keygen.params: case %s, width %u, mesh %u, rotate %u, offset %" PRIu64 + ", split %u/%u\n", + keygencase2str(i->params.keygen.keycase), i->params.keygen.width, + i->params.keygen.mesh, i->params.keygen.rotate, i->params.keygen.offset, + i->params.keygen.split, + i->params.keygen.width - i->params.keygen.split); log_info("keygen.seed: %u\n", i->params.keygen.seed); log_info("key: minlen %u, maxlen %u\n", i->params.keylen_min, i->params.keylen_max); @@ -481,3 +548,27 @@ bool actor_config::deserialize(const char *str, actor_config &config) { TRACE("<< actor_config::deserialize: OK\n"); return true; } + +unsigned actor_params::mdbx_keylen_min() const { + return (table_flags & MDBX_INTEGERKEY) ? 4 : 0; +} + +unsigned actor_params::mdbx_keylen_max() const { + return (table_flags & MDBX_INTEGERKEY) + ? 8 + : std::min((unsigned)mdbx_get_maxkeysize(pagesize), + (unsigned)UINT16_MAX); +} + +unsigned actor_params::mdbx_datalen_min() const { + return (table_flags & MDBX_INTEGERDUP) ? 4 : 0; +} + +unsigned actor_params::mdbx_datalen_max() const { + return (table_flags & MDBX_INTEGERDUP) + ? 8 + : std::min((table_flags & MDBX_DUPSORT) + ? 
(unsigned)mdbx_get_maxkeysize(pagesize) + : (unsigned)MDBX_MAXDATASIZE, + (unsigned)UINT16_MAX); +} diff --git a/test/config.h b/test/config.h index 86f37fbe..2d0fede0 100644 --- a/test/config.h +++ b/test/config.h @@ -62,6 +62,10 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool parse_option(int argc, char *const argv[], int &narg, const char *option, std::string &value, bool allow_empty = false); +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + std::string &value, bool allow_empty, + const char *default_value); + bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool &value); @@ -75,16 +79,25 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool parse_option(int argc, char *const argv[], int &narg, const char *option, uint64_t &value, const scale_mode scale, - const uint64_t minval = 0, const uint64_t maxval = INT64_MAX); + const uint64_t minval = 0, const uint64_t maxval = INT64_MAX, + const uint64_t default_value = 0); bool parse_option(int argc, char *const argv[], int &narg, const char *option, unsigned &value, const scale_mode scale, - const unsigned minval = 0, const unsigned maxval = INT32_MAX); + const unsigned minval = 0, const unsigned maxval = INT32_MAX, + const unsigned default_value = 0); bool parse_option(int argc, char *const argv[], int &narg, const char *option, uint8_t &value, const uint8_t minval = 0, - const uint8_t maxval = 255); + const uint8_t maxval = 255, const uint8_t default_value = 0); +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + int64_t &value, const int64_t minval, const int64_t maxval, + const int64_t default_value = -1); + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + int32_t &value, const int32_t minval, const int32_t maxval, + const int32_t default_value = -1); 
//----------------------------------------------------------------------------- #pragma pack(push, 1) @@ -121,6 +134,8 @@ struct keygen_params_pod { * Иначе говоря, нет смысла в со-координации генерации паттернов для * ключей и значений. Более того, генерацию значений всегда необходимо * рассматривать в контексте связки с одним значением ключа. + * - Тем не менее, во всех случаях достаточно важным является равномерная + * всех возможных сочетаний длин ключей и данных. * * width: * Большинство тестов предполагают создание или итерирование некоторого @@ -156,7 +171,7 @@ struct keygen_params_pod { * псевдо-случайные значений ключей без псевдо-случайности в значениях. * * Такое ограничение соответствуют внутренней алгоритмике libmdbx. Проще - * говоря мы можем проверить движок псевдо-случайной последовательностью + * говоря, мы можем проверить движок псевдо-случайной последовательностью * ключей на таблицах без дубликатов (без multi-value), а затем проверить * корректность работу псевдо-случайной последовательностью значений на * таблицах с дубликатами (с multi-value), опционально добавляя @@ -203,7 +218,12 @@ struct actor_params_pod { unsigned mode_flags; unsigned table_flags; - uint64_t size; + intptr_t size_lower; + intptr_t size_now; + intptr_t size_upper; + int shrink_threshold; + int growth_step; + int pagesize; unsigned test_duration; unsigned test_nops; @@ -246,6 +266,11 @@ struct actor_params : public config::actor_params_pod { std::string pathname_log; std::string pathname_db; void set_defaults(const std::string &tmpdir); + + unsigned mdbx_keylen_min() const; + unsigned mdbx_keylen_max() const; + unsigned mdbx_datalen_min() const; + unsigned mdbx_datalen_max() const; }; struct actor_config : public config::actor_config_pod { diff --git a/test/hill.cc b/test/hill.cc index 0d609b86..753a095b 100644 --- a/test/hill.cc +++ b/test/hill.cc @@ -53,7 +53,7 @@ bool testcase_hill::run() { */ /* TODO: работа в несколько потоков */ - keyvalue_maker.setup(config.params, 0 
/* thread_number */); + keyvalue_maker.setup(config.params, config.actor_id, 0 /* thread_number */); keygen::buffer a_key = keygen::alloc(config.params.keylen_max); keygen::buffer a_data_0 = keygen::alloc(config.params.datalen_max); @@ -65,7 +65,9 @@ bool testcase_hill::run() { ? MDBX_NODUPDATA : MDBX_NODUPDATA | MDBX_NOOVERWRITE; const unsigned update_flags = - MDBX_CURRENT | MDBX_NODUPDATA | MDBX_NOOVERWRITE; + (config.params.table_flags & MDBX_DUPSORT) + ? MDBX_CURRENT | MDBX_NODUPDATA | MDBX_NOOVERWRITE + : MDBX_NODUPDATA; uint64_t serial_count = 0; unsigned txn_nops = 0; @@ -115,7 +117,7 @@ bool testcase_hill::run() { rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value, &a_data_1->value, update_flags); if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_put(update-a: 1->0)", rc); + failure_perror("mdbx_replace(update-a: 1->0)", rc); if (++txn_nops >= config.params.batch_write) { txn_restart(false, false); diff --git a/test/jitter.cc b/test/jitter.cc index e7faf2a3..25514004 100644 --- a/test/jitter.cc +++ b/test/jitter.cc @@ -58,7 +58,11 @@ bool testcase_jitter::run() { jitter_delay(); db_close(); - report(1); + + /* just 'align' nops with other tests with batching */ + const auto batching = + std::max(config.params.batch_read, config.params.batch_write); + report(std::max(1u, batching / 2)); } return true; } diff --git a/test/keygen.cc b/test/keygen.cc index 1b18fa00..c7a70606 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -30,7 +30,7 @@ serial_t injective(const serial_t serial, /* LY: All these "magic" prime numbers were found * and verified with a bit of brute force. 
*/ - static const uint64_t m[64 - serial_minwith] = { + static const uint64_t m[64 - serial_minwith + 1] = { /* 8 - 24 */ 113, 157, 397, 653, 1753, 5641, 9697, 23873, 25693, 80833, 105953, 316937, 309277, 834497, 1499933, 4373441, 10184137, @@ -43,26 +43,31 @@ serial_t injective(const serial_t serial, 2420886491930041, 3601632139991929, 11984491914483833, 21805846439714153, 23171543400565993, 53353226456762893, 155627817337932409, 227827205384840249, 816509268558278821, 576933057762605689, - 2623957345935638441, 5048241705479929949, 4634245581946485653}; - static const uint8_t s[64 - serial_minwith] = { + 2623957345935638441, 5048241705479929949, 4634245581946485653, + 4613509448041658233, 4952535426879925961}; + static const uint8_t s[64 - serial_minwith + 1] = { /* 8 - 24 */ 2, 3, 4, 4, 2, 4, 3, 3, 7, 3, 3, 4, 8, 3, 10, 3, 11, /* 25 - 64 */ 11, 9, 9, 9, 11, 10, 5, 14, 11, 16, 14, 12, 13, 16, 19, 10, 10, 21, 7, 20, - 10, 14, 22, 19, 3, 21, 18, 19, 26, 24, 2, 21, 25, 29, 24, 10, 11, 14}; + 10, 14, 22, 19, 3, 21, 18, 19, 26, 24, 2, 21, 25, 29, 24, 10, 11, 14, 20, + 19}; - serial_t result = serial * m[bits - 8]; + const auto mult = m[bits - 8]; + const auto shift = s[bits - 8]; + serial_t result = serial * mult; if (salt) { const unsigned left = bits / 2; const unsigned right = bits - left; result = (result << left) | ((result & mask(bits)) >> right); - result = (result ^ salt) * m[bits - 8]; + result = (result ^ salt) * mult; } - result ^= result << s[bits - 8]; + result ^= result << shift; result &= mask(bits); - log_trace("keygen-injective: serial %" PRIu64 " into %" PRIu64, serial, - result); + log_trace("keygen-injective: serial %" PRIu64 "/%u @%" PRIx64 ",%u,%" PRIu64 + " => %" PRIu64 "/%u", + serial, bits, mult, shift, salt, result, bits); return result; } @@ -73,8 +78,9 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, assert(mapping.mesh <= mapping.width); assert(mapping.rotate <= mapping.width); assert(mapping.offset <= 
mask(mapping.width)); - assert(!(key_essentials.flags & (MDBX_INTEGERDUP | MDBX_REVERSEDUP))); - assert(!(value_essentials.flags & (MDBX_INTEGERKEY | MDBX_REVERSEKEY))); + assert(!(key_essentials.flags & + ~(MDBX_INTEGERKEY | MDBX_REVERSEKEY | MDBX_DUPSORT))); + assert(!(value_essentials.flags & ~(MDBX_INTEGERDUP | MDBX_REVERSEDUP))); log_trace("keygen-pair: serial %" PRIu64 ", data-age %" PRIu64, serial, value_age); @@ -82,31 +88,49 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, if (mapping.mesh >= serial_minwith) { serial = (serial & ~mask(mapping.mesh)) | injective(serial, mapping.mesh, salt); - log_trace("keygen-pair: mesh %" PRIu64, serial); + log_trace("keygen-pair: mesh@%u => %" PRIu64, mapping.mesh, serial); } if (mapping.rotate) { const unsigned right = mapping.rotate; const unsigned left = mapping.width - right; serial = (serial << left) | ((serial & mask(mapping.width)) >> right); - log_trace("keygen-pair: rotate %" PRIu64 ", 0x%" PRIx64, serial, serial); + log_trace("keygen-pair: rotate@%u => %" PRIu64 ", 0x%" PRIx64, + mapping.rotate, serial, serial); } - serial = (serial + mapping.offset) & mask(mapping.width); - log_trace("keygen-pair: offset %" PRIu64, serial); - serial += base; + if (mapping.offset) { + serial = (serial + mapping.offset) & mask(mapping.width); + log_trace("keygen-pair: offset@%" PRIu64 " => %" PRIu64, mapping.offset, + serial); + } + if (base) { + serial += base; + log_trace("keygen-pair: base@%" PRIu64 " => %" PRIu64, base, serial); + } serial_t key_serial = serial; - serial_t value_serial = value_age; + serial_t value_serial = value_age << mapping.split; if (mapping.split) { - key_serial = serial >> mapping.split; - value_serial = - (serial & mask(mapping.split)) | (value_age << mapping.split); + if (key_essentials.flags & MDBX_DUPSORT) { + key_serial >>= mapping.split; + value_serial += serial & mask(mapping.split); + } else { + /* Без MDBX_DUPSORT требуется уникальность ключей, а для этого нельзя + 
* отбрасывать какие-либо биты serial после инъективного преобразования. + * Поэтому key_serial не трогаем, а в value_serial нелинейно вмешиваем + * запрошенное количество бит из serial */ + value_serial += + (serial ^ (serial >> mapping.split)) & mask(mapping.split); + } + + value_serial |= value_age << mapping.split; + log_trace("keygen-pair: split@%u => k%" PRIu64 ", v%" PRIu64, mapping.split, + key_serial, value_serial); } log_trace("keygen-pair: key %" PRIu64 ", value %" PRIu64, key_serial, value_serial); - mk(key_serial, key_essentials, *key); mk(value_serial, value_essentials, *value); @@ -118,10 +142,10 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, } } -void maker::setup(const config::actor_params_pod &actor, +void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, unsigned thread_number) { key_essentials.flags = - actor.table_flags & (MDBX_INTEGERKEY | MDBX_REVERSEKEY); + actor.table_flags & (MDBX_INTEGERKEY | MDBX_REVERSEKEY | MDBX_DUPSORT); assert(actor.keylen_min <= UINT8_MAX); key_essentials.minlen = (uint8_t)actor.keylen_min; assert(actor.keylen_max <= UINT16_MAX); @@ -137,7 +161,7 @@ void maker::setup(const config::actor_params_pod &actor, assert(thread_number < 2); (void)thread_number; mapping = actor.keygen; - salt = actor.keygen.seed * UINT64_C(14653293970879851569); + salt = (actor.keygen.seed + actor_id) * UINT64_C(14653293970879851569); // FIXME: TODO base = 0; @@ -165,7 +189,7 @@ bool maker::increment(serial_t &serial, int delta) { //----------------------------------------------------------------------------- -size_t length(serial_t serial) { +static size_t length(serial_t serial) { size_t n = 0; if (serial > UINT32_MAX) { n = 4; @@ -199,7 +223,10 @@ void __hot maker::mk(const serial_t serial, const essentials ¶ms, assert(params.maxlen >= length(serial)); out.value.iov_base = out.bytes; - out.value.iov_len = params.minlen; + out.value.iov_len = + (params.maxlen > params.minlen) + ? 
params.minlen + serial % (params.maxlen - params.minlen) + : params.minlen; if (params.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) { assert(params.maxlen == params.minlen); diff --git a/test/keygen.h b/test/keygen.h index c1e907bc..bbd97b29 100644 --- a/test/keygen.h +++ b/test/keygen.h @@ -44,7 +44,7 @@ namespace keygen { * - абсолютное значение ключей или разность между отдельными значениями; * * Соответственно, в общих чертах, схема генерации следующая: - * - вводится плоская одномерная "координата" uint64_t; + * - вводится плоская одномерная "координата" serial (uint64_t); * - генерация специфических паттернов (последовательностей) * реализуется посредством соответствующих преобразований "координат", при * этом все подобные преобразования выполняются только над "координатой"; @@ -74,7 +74,7 @@ typedef uint64_t serial_t; enum : serial_t { serial_minwith = 8, serial_maxwith = sizeof(serial_t) * 8, - serial_allones = ~(serial_t)0 + serial_allones = ~(serial_t)0u }; struct result { @@ -85,6 +85,10 @@ struct result { uint32_t u32; uint64_t u64; }; + + std::string as_string() const { + return std::string((const char *)value.iov_base, value.iov_len); + } }; //----------------------------------------------------------------------------- @@ -115,11 +119,10 @@ public: void pair(serial_t serial, const buffer &key, buffer &value, serial_t value_age); - void setup(const config::actor_params_pod &actor, unsigned thread_number); + void setup(const config::actor_params_pod &actor, unsigned actor_id, + unsigned thread_number); bool increment(serial_t &serial, int delta); }; -size_t length(serial_t serial); - } /* namespace keygen */ diff --git a/test/log.cc b/test/log.cc index 521e1d69..0e325e3a 100644 --- a/test/log.cc +++ b/test/log.cc @@ -37,6 +37,31 @@ void __noreturn failure_perror(const char *what, int errnum) { //----------------------------------------------------------------------------- +static void mdbx_logger(int type, const char *function, int line, + const char 
*msg, va_list args) { + logging::loglevel level = logging::info; + if (type & MDBX_DBG_EXTRA) + level = logging::extra; + if (type & MDBX_DBG_TRACE) + level = logging::trace; + if (type & MDBX_DBG_PRINT) + level = logging::verbose; + + if (!function) + function = "unknown"; + if (type & MDBX_DBG_ASSERT) { + log_error("mdbx: assertion failure: %s, %d", function, line); + level = logging::failure; + } + + if (logging::output( + level, + strncmp(function, "mdbx_", 5) == 0 ? "%s: " : "mdbx: %s: ", function)) + logging::feed_ap(msg, args); + if (type & MDBX_DBG_ASSERT) + abort(); +} + namespace logging { static std::string prefix; @@ -44,8 +69,19 @@ static std::string suffix; static loglevel level; static FILE *last; -void setup(loglevel _level, const std::string &_prefix) { +void setlevel(loglevel _level) { level = (_level > error) ? failure : _level; + int mdbx_dbg_opts = MDBX_DBG_ASSERT | MDBX_DBG_JITTER | MDBX_DBG_DUMP; + if (level <= trace) + mdbx_dbg_opts |= MDBX_DBG_TRACE; + if (level <= verbose) + mdbx_dbg_opts |= MDBX_DBG_PRINT; + int rc = mdbx_setup_debug(mdbx_dbg_opts, mdbx_logger); + log_trace("set mdbx debug-opts: 0x%02x", rc); +} + +void setup(loglevel _level, const std::string &_prefix) { + setlevel(_level); prefix = _prefix; } @@ -157,7 +193,7 @@ bool output(const logging::loglevel priority, const char *format, va_list ap) { return true; } -bool feed(const char *format, va_list ap) { +bool feed_ap(const char *format, va_list ap) { if (!last) return false; @@ -176,7 +212,7 @@ bool feed(const char *format, ...) 
{ va_list ap; va_start(ap, format); - feed(format, ap); + feed_ap(format, ap); va_end(ap); return true; } diff --git a/test/log.h b/test/log.h index e97e954c..e09cccaa 100644 --- a/test/log.h +++ b/test/log.h @@ -46,11 +46,12 @@ enum loglevel { const char *level2str(const loglevel level); void setup(loglevel level, const std::string &prefix); void setup(const std::string &prefix); +void setlevel(loglevel level); bool output(const loglevel priority, const char *format, va_list ap); bool __printf_args(2, 3) output(const loglevel priority, const char *format, ...); -bool feed(const char *format, va_list ap); +bool feed_ap(const char *format, va_list ap); bool __printf_args(1, 2) feed(const char *format, ...); class local_suffix { diff --git a/test/main.cc b/test/main.cc index d417480d..3384311b 100644 --- a/test/main.cc +++ b/test/main.cc @@ -35,25 +35,31 @@ void actor_params::set_defaults(const std::string &tmpdir) { mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD | MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM; table_flags = MDBX_DUPSORT; - size = 1024 * 1024 * 4; + + size_lower = -1; + size_now = 1024 * 1024 * ((table_flags & MDBX_DUPSORT) ? 4 : 256); + size_upper = -1; + shrink_threshold = -1; + growth_step = -1; + pagesize = -1; keygen.seed = 1; keygen.keycase = kc_random; - keygen.width = 32; - keygen.mesh = 32; + keygen.width = (table_flags & MDBX_DUPSORT) ? 
32 : 64; + keygen.mesh = keygen.width; keygen.split = keygen.width / 2; - keygen.rotate = 0; - keygen.offset = 0; + keygen.rotate = 3; + keygen.offset = 41; test_duration = 0; test_nops = 1000; nrepeat = 1; nthreads = 1; - keylen_min = 0; - keylen_max = 42; - datalen_min = 0; - datalen_max = 256; + keylen_min = mdbx_keylen_min(); + keylen_max = mdbx_keylen_max(); + datalen_min = mdbx_datalen_min(); + datalen_max = std::min(mdbx_datalen_max(), 256u * 1024 + 42); batch_read = 4; batch_write = 4; @@ -148,10 +154,53 @@ int main(int argc, char *const argv[]) { config::mode_bits)) continue; if (config::parse_option(argc, argv, narg, "table", params.table_flags, - config::table_bits)) + config::table_bits)) { + if ((params.table_flags & MDBX_DUPFIXED) == 0) + params.table_flags &= ~MDBX_INTEGERDUP; + if ((params.table_flags & MDBX_DUPSORT) == 0) + params.table_flags &= + ~(MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP); continue; - if (config::parse_option(argc, argv, narg, "size", params.size, - config::binary, 4096 * 4)) + } + + if (config::parse_option(argc, argv, narg, "pagesize", params.pagesize, + mdbx_limits_pgsize_min(), + mdbx_limits_pgsize_max())) { + const unsigned keylen_max = params.mdbx_keylen_max(); + if (params.keylen_min > keylen_max) + params.keylen_min = keylen_max; + if (params.keylen_max > keylen_max) + params.keylen_max = keylen_max; + const unsigned datalen_max = params.mdbx_datalen_max(); + if (params.datalen_min > datalen_max) + params.datalen_min = datalen_max; + if (params.datalen_max > datalen_max) + params.datalen_max = datalen_max; + continue; + } + if (config::parse_option(argc, argv, narg, "size-lower", params.size_lower, + mdbx_limits_dbsize_min(params.pagesize), + mdbx_limits_dbsize_max(params.pagesize))) + continue; + if (config::parse_option(argc, argv, narg, "size", params.size_now, + mdbx_limits_dbsize_min(params.pagesize), + mdbx_limits_dbsize_max(params.pagesize))) + continue; + if (config::parse_option(argc, argv, narg, 
"size-upper", params.size_upper, + mdbx_limits_dbsize_min(params.pagesize), + mdbx_limits_dbsize_max(params.pagesize))) + continue; + if (config::parse_option( + argc, argv, narg, "shrink-threshold", params.shrink_threshold, 0, + (int)std::min((intptr_t)INT_MAX, + mdbx_limits_dbsize_max(params.pagesize) - + mdbx_limits_dbsize_min(params.pagesize)))) + continue; + if (config::parse_option( + argc, argv, narg, "growth-step", params.growth_step, 0, + (int)std::min((intptr_t)INT_MAX, + mdbx_limits_dbsize_max(params.pagesize) - + mdbx_limits_dbsize_min(params.pagesize)))) continue; if (config::parse_option(argc, argv, narg, "keygen.width", @@ -188,30 +237,36 @@ int main(int argc, char *const argv[]) { config::duration, 1)) continue; if (config::parse_option(argc, argv, narg, "keylen.min", params.keylen_min, - config::no_scale, 0, UINT8_MAX)) { - if (params.keylen_max < params.keylen_min) + config::no_scale, params.mdbx_keylen_min(), + params.mdbx_keylen_max())) { + if ((params.table_flags & MDBX_INTEGERKEY) || + params.keylen_max < params.keylen_min) params.keylen_max = params.keylen_min; continue; } - if (config::parse_option( - argc, argv, narg, "keylen.max", params.keylen_max, config::no_scale, - 0, - std::min((unsigned)mdbx_get_maxkeysize(0), (unsigned)UINT16_MAX))) { - if (params.keylen_min > params.keylen_max) + if (config::parse_option(argc, argv, narg, "keylen.max", params.keylen_max, + config::no_scale, params.mdbx_keylen_min(), + params.mdbx_keylen_max())) { + if ((params.table_flags & MDBX_INTEGERKEY) || + params.keylen_min > params.keylen_max) params.keylen_min = params.keylen_max; continue; } if (config::parse_option(argc, argv, narg, "datalen.min", - params.datalen_min, config::no_scale, 0, - UINT8_MAX)) { - if (params.datalen_max < params.datalen_min) + params.datalen_min, config::no_scale, + params.mdbx_datalen_min(), + params.mdbx_datalen_max())) { + if ((params.table_flags & MDBX_DUPFIXED) || + params.datalen_max < params.datalen_min) 
params.datalen_max = params.datalen_min; continue; } if (config::parse_option(argc, argv, narg, "datalen.max", - params.datalen_max, config::no_scale, 0, - std::min((int)UINT16_MAX, MDBX_MAXDATASIZE))) { - if (params.datalen_min > params.datalen_max) + params.datalen_max, config::no_scale, + params.mdbx_datalen_min(), + params.mdbx_datalen_max())) { + if ((params.table_flags & MDBX_DUPFIXED) || + params.datalen_min > params.datalen_max) params.datalen_min = params.datalen_max; continue; } diff --git a/test/osal-unix.cc b/test/osal-unix.cc index 8132e267..6661ae42 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -182,6 +182,9 @@ void osal_killall_actors(void) { } int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) { + struct timespec ts; + ts.tv_nsec = 0; + ts.tv_sec = timeout; retry: int status, options = WNOHANG; #ifdef WUNTRACED @@ -209,9 +212,16 @@ retry: } if (pid == 0) { - if (timeout && sleep(timeout)) + /* child still running */ + if (ts.tv_sec == 0 && ts.tv_nsec == 0) + ts.tv_nsec = 1; + if (nanosleep(&ts, &ts) == 0) { + /* timeout and no signal fomr child */ + pid = 0; + return 0; + } + if (errno == EINTR) goto retry; - return 0; } switch (errno) { diff --git a/test/osal-windows.cc b/test/osal-windows.cc index b8cdb535..7d59f657 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -262,15 +262,24 @@ int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { STARTUPINFOA StartupInfo; GetStartupInfoA(&StartupInfo); - char exename[_MAX_PATH]; + char exename[_MAX_PATH + 1]; DWORD exename_size = sizeof(exename); if (!QueryFullProcessImageNameA(GetCurrentProcess(), 0, exename, &exename_size)) failure_perror("QueryFullProcessImageName()", GetLastError()); - std::string cmdline = "test_mdbx.child "; + if (exename[1] != ':') { + exename_size = GetModuleFileName(NULL, exename, sizeof(exename)); + if (exename_size >= sizeof(exename)) + return ERROR_BAD_LENGTH; + } + + std::string cmdline = "$ "; ArgvQuote(cmdline, thunk_param(config)); 
+ if (cmdline.size() >= 32767) + return ERROR_BAD_LENGTH; + PROCESS_INFORMATION ProcessInformation; if (!CreateProcessA(exename, const_cast(cmdline.c_str()), NULL, // Retuned process handle is not inheritable. @@ -280,7 +289,7 @@ int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { NULL, // Inherit the parent's environment. NULL, // Inherit the parent's current directory. &StartupInfo, &ProcessInformation)) - return GetLastError(); + failure_perror(exename, GetLastError()); CloseHandle(ProcessInformation.hThread); pid = ProcessInformation.dwProcessId; diff --git a/test/test.cc b/test/test.cc index 3750af52..c28bbd22 100644 --- a/test/test.cc +++ b/test/test.cc @@ -68,31 +68,6 @@ const char *keygencase2str(const keygen_case keycase) { //----------------------------------------------------------------------------- -static void mdbx_logger(int type, const char *function, int line, - const char *msg, va_list args) { - logging::loglevel level = logging::info; - if (type & MDBX_DBG_EXTRA) - level = logging::extra; - if (type & MDBX_DBG_TRACE) - level = logging::trace; - if (type & MDBX_DBG_PRINT) - level = logging::verbose; - - if (!function) - function = "unknown"; - if (type & MDBX_DBG_ASSERT) { - log_error("mdbx: assertion failure: %s, %d", function, line); - level = logging::failure; - } - - if (logging::output( - level, - strncmp(function, "mdbx_", 5) == 0 ? 
"%s: " : "mdbx: %s: ", function)) - logging::feed(msg, args); - if (type & MDBX_DBG_ASSERT) - abort(); -} - int testcase::oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, unsigned gap, int retry) { @@ -117,16 +92,8 @@ void testcase::db_prepare() { log_trace(">> db_prepare"); assert(!db_guard); - int mdbx_dbg_opts = MDBX_DBG_ASSERT | MDBX_DBG_JITTER | MDBX_DBG_DUMP; - if (config.params.loglevel <= logging::trace) - mdbx_dbg_opts |= MDBX_DBG_TRACE; - if (config.params.loglevel <= logging::verbose) - mdbx_dbg_opts |= MDBX_DBG_PRINT; - int rc = mdbx_setup_debug(mdbx_dbg_opts, mdbx_logger); - log_trace("set mdbx debug-opts: 0x%02x", rc); - MDBX_env *env = nullptr; - rc = mdbx_env_create(&env); + int rc = mdbx_env_create(&env); if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_create()", rc); @@ -149,7 +116,10 @@ void testcase::db_prepare() { if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_set_oomfunc()", rc); - rc = mdbx_env_set_mapsize(env, (size_t)config.params.size); + rc = mdbx_env_set_geometry( + env, config.params.size_lower, config.params.size_now, + config.params.size_upper, config.params.growth_step, + config.params.shrink_threshold, config.params.pagesize); if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_set_mapsize()", rc); From 5a29214ad9f230cc7653d0707a477cf1f30024bf Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Sep 2018 12:37:33 +0300 Subject: [PATCH 44/83] mdbx-test: backport - update 'gc.sh' script (squashed). --- test/gc.sh | 52 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/test/gc.sh b/test/gc.sh index 81e32ba6..b8431d0f 100755 --- a/test/gc.sh +++ b/test/gc.sh @@ -1,6 +1,7 @@ #!/bin/bash set -euo pipefail -TESTDB_PREFIX=${1:-/dev/shm/mdbx-gc-test} +make check +TESTDB_PREFIX=${1:-/dev/shm/mdbx-gc-test}. 
function rep9 { printf "%*s" $1 '' | tr ' ' '9'; } function join { local IFS="$1"; shift; echo "$*"; } @@ -18,16 +19,45 @@ function bits2list { join , "${list[@]}" } -for nops in {1..7}; do - for ((wbatch=nops; wbatch > 0; --wbatch)); do - for ((bits=2**${#options[@]}; --bits >= 0; )); do - echo "=================================== $(date)" - rm -f ${TESTDB_PREFIX}* - echo --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) - ./mdbx_test --pathname=${TESTDB_PREFIX} --size=8G --keylen.min=1 --keylen.max=250 --datalen.min=1 --datalen.max=500 \ - --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --hill | bzip2 -c > ${TESTDB_PREFIX}.log.bz2 - ./mdbx_chk -nvv ${TESTDB_PREFIX} | tee ${TESTDB_PREFIX}-chk.log +function probe { + echo "=============================================== $(date)" + echo "${caption}: $*" + rm -f ${TESTDB_PREFIX}* \ + && ./mdbx_test --pathname=${TESTDB_PREFIX}db "$@" | lz4 > ${TESTDB_PREFIX}log.lz4 \ + && ./mdbx_chk -nvv ${TESTDB_PREFIX}db | tee ${TESTDB_PREFIX}chk \ + || (echo "FAILED"; exit 1) +} + +############################################################################### + +count=0 +for nops in {2..7}; do + for ((wbatch=nops-1; wbatch > 0; --wbatch)); do + loops=$(((3333 >> nops) / nops + 1)) + for ((rep=0; rep++ < loops; )); do + for ((bits=2**${#options[@]}; --bits >= 0; )); do + seed=$(date +%N) + caption="Probe #$((++count)) int-key,w/o-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size=6G --table=+key.integer,-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + caption="Probe #$((++count)) int-key,with-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size=6G --table=+key.integer,+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$( rep9 
$nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + caption="Probe #$((++count)) int-key,int-data, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size=6G --table=+key.integer,+data.integer --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + caption="Probe #$((++count)) w/o-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size=6G --table=-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} --hill + caption="Probe #$((++count)) with-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size=6G --table=+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} --hill + done done done done From d2854e07603c1c73c17ad362bd02536dbc18f1bc Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Sep 2018 12:37:37 +0300 Subject: [PATCH 45/83] mdbx: backport - refine mdbx_chk (squashed). - refine 'mismatch idl length' error message. - add/fix printf-format checking. - refine dbi-structure. 
--- src/tools/mdbx_chk.c | 180 ++++++++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 80 deletions(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 0fd23ae6..9ad65f8d 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -1,4 +1,4 @@ -/* mdbx_chk.c - memory-mapped database check tool */ +/* mdbx_chk.c - memory-mapped database check tool */ /* * Copyright 2015-2018 Leonid Yuriev @@ -61,12 +61,18 @@ static void signal_handler(int sig) { #define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1) #define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE +typedef struct { + const char *name; + struct { + uint64_t total; + uint64_t empty; + } pages; + uint64_t payload_bytes; + uint64_t lost_bytes; +} walk_dbi_t; + struct { - const char *dbi_names[MAX_DBI]; - uint64_t dbi_pages[MAX_DBI]; - uint64_t dbi_empty_pages[MAX_DBI]; - uint64_t dbi_payload_bytes[MAX_DBI]; - uint64_t dbi_lost_bytes[MAX_DBI]; + walk_dbi_t dbi[MAX_DBI]; short *pagemap; uint64_t total_payload_bytes; uint64_t pgcount; @@ -95,7 +101,7 @@ struct problem *problems_list; uint64_t total_problems; static void -#ifdef __GNU__ +#ifdef __GNUC__ __attribute__((format(printf, 1, 2))) #endif print(const char *msg, ...) { @@ -110,7 +116,7 @@ static void } static void -#ifdef __GNU__ +#ifdef __GNUC__ __attribute__((format(printf, 1, 2))) #endif error(const char *msg, ...) 
{ @@ -131,9 +137,9 @@ static void pagemap_cleanup(void) { int i; for (i = 1; i < MAX_DBI; ++i) { - if (walk.dbi_names[i]) { - free((void *)walk.dbi_names[i]); - walk.dbi_names[i] = NULL; + if (walk.dbi[i].name) { + free((void *)walk.dbi[i].name); + walk.dbi[i].name = NULL; } } @@ -141,32 +147,35 @@ static void pagemap_cleanup(void) { walk.pagemap = NULL; } -static int pagemap_lookup_dbi(const char *dbi) { - static int last; - int i; +static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name) { + static walk_dbi_t *last; - if (last > 0 && strcmp(walk.dbi_names[last], dbi) == 0) + if (last && strcmp(last->name, dbi_name) == 0) return last; - for (i = 1; walk.dbi_names[i] && last < MAX_DBI; ++i) - if (strcmp(walk.dbi_names[i], dbi) == 0) - return last = i; - - if (i == MAX_DBI) - return -1; - - walk.dbi_names[i] = strdup(dbi); + walk_dbi_t *dbi = walk.dbi + 1; + while (dbi->name) { + if (strcmp(dbi->name, dbi_name) == 0) + return last = dbi; + if (++dbi == walk.dbi + MAX_DBI) + return NULL; + } + dbi->name = strdup(dbi_name); if (verbose > 1) { - print(" - found '%s' area\n", dbi); + print(" - found '%s' area\n", dbi_name); fflush(NULL); } - return last = i; + return last = dbi; } -static void problem_add(const char *object, uint64_t entry_number, - const char *msg, const char *extra, ...) { +static void +#ifdef __GNUC__ + __attribute__((format(printf, 4, 5))) +#endif + problem_add(const char *object, uint64_t entry_number, const char *msg, + const char *extra, ...) 
{ total_problems++; if (!quiet) { @@ -233,7 +242,7 @@ static uint64_t problems_pop(struct problem *list) { } static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, - const char *dbi, const char *type, size_t nentries, + const char *dbi_name, const char *type, size_t nentries, size_t payload_bytes, size_t header_bytes, size_t unused_bytes) { (void)ctx; @@ -241,54 +250,58 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, if (type) { uint64_t page_bytes = payload_bytes + header_bytes + unused_bytes; size_t page_size = (size_t)pgnumber * envstat.ms_psize; - int index = pagemap_lookup_dbi(dbi); - if (index < 0) + walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name); + if (!dbi) return MDBX_ENOMEM; - if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) { + if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi_name) == 0)) { if (pgnumber == 1) print(" %s-page %" PRIu64, type, pgno); else print(" %s-span %" PRIu64 "[%u]", type, pgno, pgnumber); print(" of %s: header %" PRIiPTR ", payload %" PRIiPTR ", unused %" PRIiPTR "\n", - dbi, header_bytes, payload_bytes, unused_bytes); + dbi_name, header_bytes, payload_bytes, unused_bytes); } walk.pgcount += pgnumber; if (unused_bytes > page_size) - problem_add("page", pgno, "illegal unused-bytes", "%u < %i < %u", 0, - unused_bytes, envstat.ms_psize); + problem_add("page", pgno, "illegal unused-bytes", + "%u < %" PRIuPTR " < %u", 0, unused_bytes, envstat.ms_psize); if (header_bytes < (int)sizeof(long) || (size_t)header_bytes >= envstat.ms_psize - sizeof(long)) problem_add("page", pgno, "illegal header-length", - "%" PRIuPTR " < %i < %" PRIuPTR "", sizeof(long), + "%" PRIuPTR " < %" PRIuPTR " < %" PRIuPTR, sizeof(long), header_bytes, envstat.ms_psize - sizeof(long)); if (payload_bytes < 1) { if (nentries > 1) { problem_add("page", pgno, "zero size-of-entry", - "payload %i bytes, %i entries", payload_bytes, nentries); + "payload %" PRIuPTR " bytes, %" PRIuPTR " entries", + payload_bytes, 
nentries); if ((size_t)header_bytes + unused_bytes < page_size) { /* LY: hush a misuse error */ page_bytes = page_size; } } else { - problem_add("page", pgno, "empty", "payload %i bytes, %i entries", + problem_add("page", pgno, "empty", + "payload %" PRIuPTR " bytes, %" PRIuPTR " entries", payload_bytes, nentries); - walk.dbi_empty_pages[index] += 1; + dbi->pages.empty += 1; } } if (page_bytes != page_size) { problem_add("page", pgno, "misused", - "%" PRIu64 " != %" PRIu64 " (%ih + %ip + %iu)", page_size, - page_bytes, header_bytes, payload_bytes, unused_bytes); + "%" PRIu64 " != %" PRIu64 " (%" PRIuPTR "h + %" PRIuPTR + "p + %" PRIuPTR "u)", + page_size, page_bytes, header_bytes, payload_bytes, + unused_bytes); if (page_size > page_bytes) - walk.dbi_lost_bytes[index] += page_size - page_bytes; + dbi->lost_bytes += page_size - page_bytes; } else { - walk.dbi_payload_bytes[index] += payload_bytes + header_bytes; + dbi->payload_bytes += payload_bytes + header_bytes; walk.total_payload_bytes += payload_bytes + header_bytes; } @@ -299,10 +312,10 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, "%" PRIu64 " > %" PRIu64 "", pgno, lastpgno); else if (walk.pagemap[pgno]) problem_add("page", pgno, "already used", "in %s", - walk.dbi_names[walk.pagemap[pgno]]); + walk.dbi[walk.pagemap[pgno]].name); else { - walk.pagemap[pgno] = (short)index; - walk.dbi_pages[index] += 1; + walk.pagemap[pgno] = (short)(dbi - walk.dbi); + dbi->pages.total += 1; } ++pgno; } while (--pgnumber); @@ -337,16 +350,22 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN "", txnid); if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) - problem_add("entry", record_number, "wrong idl size", "%" PRIuPTR "", + problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR "", data->iov_len); else { const pgno_t number = *iptr++; - if (number >= MDBX_PNL_UM_MAX) - problem_add("entry", 
record_number, "wrong idl length", "%" PRIiPTR "", - number); - else if ((number + 1) * sizeof(pgno_t) != data->iov_len) - problem_add("entry", record_number, "mismatch idl length", - "%" PRIuSIZE " != %" PRIuSIZE "", + if (number < 1 || number >= INT_MAX / 2) + problem_add("entry", txnid, "wrong idl length", "%" PRIaPGNO, number); + else if ((number + 1) * sizeof(pgno_t) > data->iov_len) + problem_add("entry", txnid, "trimmed idl", + "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", + (number + 1) * sizeof(pgno_t), data->iov_len); + else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= + /* LY: allow gap upto one page. it is ok + * and better than shink-and-retry inside mdbx_update_gc() */ + envstat.ms_psize) + problem_add("entry", txnid, "extra idl space", + "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", (number + 1) * sizeof(pgno_t), data->iov_len); else { freedb_pages += number; @@ -359,12 +378,12 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, for (unsigned i = 0; i < number; ++i) { const pgno_t pg = iptr[i]; if (pg < NUM_METAS || pg > envinfo.mi_last_pgno) - problem_add("entry", record_number, "wrong idl entry", + problem_add("entry", txnid, "wrong idl entry", "%u < %" PRIaPGNO " < %" PRIu64 "", NUM_METAS, pg, envinfo.mi_last_pgno); else if (MDBX_PNL_DISORDERED(prev, pg)) { bad = " [bad sequence]"; - problem_add("entry", record_number, "bad sequence", + problem_add("entry", txnid, "bad sequence", "%" PRIaPGNO " <> %" PRIaPGNO "", prev, pg); } prev = pg; @@ -516,7 +535,7 @@ static int process_db(MDBX_dbi dbi, char *name, visitor *handler, bool silent) { if (key.iov_len > maxkeysize) { problem_add("entry", record_count, "key length exceeds max-key-size", - "%" PRIuPTR " > %u", key.iov_len, maxkeysize); + "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && key.iov_len != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong key length", @@ 
-758,7 +777,7 @@ static void print_size(const char *prefix, const uint64_t value, } int main(int argc, char *argv[]) { - int i, rc; + int rc; char *prog = argv[0]; char *envname; int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; @@ -778,14 +797,14 @@ int main(int argc, char *argv[]) { } #endif - walk.dbi_names[0] = "@gc"; + walk.dbi[FREE_DBI].name = "@gc"; atexit(pagemap_cleanup); if (argc < 2) { usage(prog); } - while ((i = getopt(argc, argv, "Vvqnwcds:")) != EOF) { + for (int i; (i = getopt(argc, argv, "Vvqnwcds:")) != EOF;) { switch (i) { case 'V': printf("%s (%s, build %s)\n", mdbx_version.git.describe, @@ -988,24 +1007,25 @@ int main(int argc, char *argv[]) { goto bailout; } - uint64_t n; - for (n = 0; n < lastpgno; ++n) + for (uint64_t n = 0; n < lastpgno; ++n) if (!walk.pagemap[n]) - walk.dbi_pages[0] += 1; + walk.dbi[FREE_DBI].pages.total += 1; empty_pages = lost_bytes = 0; - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - empty_pages += walk.dbi_empty_pages[i]; - lost_bytes += walk.dbi_lost_bytes[i]; + for (walk_dbi_t *dbi = walk.dbi; ++dbi < walk.dbi + MAX_DBI && dbi->name;) { + empty_pages += dbi->pages.empty; + lost_bytes += dbi->lost_bytes; } if (verbose) { uint64_t total_page_bytes = walk.pgcount * envstat.ms_psize; print(" - dbi pages: %" PRIu64 " total", walk.pgcount); if (verbose > 1) - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) - print(", %s %" PRIu64 "", walk.dbi_names[i], walk.dbi_pages[i]); - print(", %s %" PRIu64 "\n", walk.dbi_names[0], walk.dbi_pages[0]); + for (walk_dbi_t *dbi = walk.dbi; + ++dbi < walk.dbi + MAX_DBI && dbi->name;) + print(", %s %" PRIu64, dbi->name, dbi->pages.total); + print(", %s %" PRIu64 "\n", walk.dbi[FREE_DBI].name, + walk.dbi[FREE_DBI].pages.total); if (verbose > 1) { print(" - space info: total %" PRIu64 " bytes, payload %" PRIu64 " (%.1f%%), unused " @@ -1015,19 +1035,19 @@ int main(int argc, char *argv[]) { total_page_bytes - walk.total_payload_bytes, (total_page_bytes - 
walk.total_payload_bytes) * 100.0 / total_page_bytes); - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - uint64_t dbi_bytes = walk.dbi_pages[i] * envstat.ms_psize; + for (walk_dbi_t *dbi = walk.dbi; + ++dbi < walk.dbi + MAX_DBI && dbi->name;) { + uint64_t dbi_bytes = dbi->pages.total * envstat.ms_psize; print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", - walk.dbi_names[i], dbi_bytes, - dbi_bytes * 100.0 / total_page_bytes, walk.dbi_payload_bytes[i], - walk.dbi_payload_bytes[i] * 100.0 / dbi_bytes, - dbi_bytes - walk.dbi_payload_bytes[i], - (dbi_bytes - walk.dbi_payload_bytes[i]) * 100.0 / dbi_bytes); - if (walk.dbi_empty_pages[i]) - print(", %" PRIu64 " empty pages", walk.dbi_empty_pages[i]); - if (walk.dbi_lost_bytes[i]) - print(", %" PRIu64 " bytes lost", walk.dbi_lost_bytes[i]); + dbi->name, dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, + dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes, + dbi_bytes - dbi->payload_bytes, + (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes); + if (dbi->pages.empty) + print(", %" PRIu64 " empty pages", dbi->pages.empty); + if (dbi->lost_bytes) + print(", %" PRIu64 " bytes lost", dbi->lost_bytes); print("\n"); } } @@ -1084,9 +1104,9 @@ int main(int argc, char *argv[]) { error("used pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", walk.pgcount, lastpgno - freedb_pages); } - if (walk.dbi_pages[0] != freedb_pages) { + if (walk.dbi[FREE_DBI].pages.total != freedb_pages) { error("gc pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", - walk.dbi_pages[0], freedb_pages); + walk.dbi[FREE_DBI].pages.total, freedb_pages); } } else if (verbose) { print(" - skip check used and gc pages (btree-traversal with " From 5049c86517d84d0f2c98273155b8a735705ed0bd Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Sep 2018 13:06:06 +0300 Subject: [PATCH 46/83] mdbx: backport - avoid empty and unneeded large/overflow pages (squashed). 
--- src/mdbx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 64e28a58..70dd069b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7906,7 +7906,10 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDBX_page *omp; pgno_t pg; - int level, ovpages, dpages = OVPAGES(env, data->iov_len); + int level, ovpages, + dpages = (LEAFSIZE(key, data) > env->me_nodemax) + ? OVPAGES(env, data->iov_len) + : 0; memcpy(&pg, olddata.iov_base, sizeof(pg)); if (unlikely((rc2 = mdbx_page_get(mc, pg, &omp, &level)) != 0)) @@ -7914,7 +7917,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, ovpages = omp->mp_pages; /* Is the ov page large enough? */ - if (ovpages >= dpages) { + if (ovpages == + /* LY: add configuragle theshold to keep reserve space */ dpages) { if (!(omp->mp_flags & P_DIRTY) && (level || (env->me_flags & MDBX_WRITEMAP))) { rc = mdbx_page_unspill(mc->mc_txn, omp, &omp); From de44ecccd1d74028000049d6252efa045e3ec322 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Sep 2018 14:31:29 +0300 Subject: [PATCH 47/83] mdbx: backport - update MAX_PAGENO and MAX_MAPSIZE64. --- src/bits.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/bits.h b/src/bits.h index d6de6059..76419db5 100644 --- a/src/bits.h +++ b/src/bits.h @@ -160,7 +160,7 @@ * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; #define PRIaPGNO PRIu32 -#define MAX_PAGENO ((pgno_t)UINT64_C(0xffffFFFFffff)) +#define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS /* A transaction ID. */ @@ -389,9 +389,7 @@ typedef struct MDBX_page { #else #define MAX_MAPSIZE32 UINT32_C(0x7ff80000) #endif -#define MAX_MAPSIZE64 \ - ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ - : MAX_PAGENO * (uint64_t)MAX_PAGESIZE) +#define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) #define MAX_MAPSIZE ((sizeof(size_t) < 8) ? 
MAX_MAPSIZE32 : MAX_MAPSIZE64) From 02276500c9f852d7e9cf07bcada064dd7b0d3f7c Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 29 Aug 2018 01:25:01 +0100 Subject: [PATCH 48/83] mdbx-doc: import - GET_MULTIPLE etc don't return the key (ITS#8908). Unnecessary since these are DUPs, the key will always be the same --- mdbx.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mdbx.h b/mdbx.h index 2d15cc24..17272a5b 100644 --- a/mdbx.h +++ b/mdbx.h @@ -344,17 +344,17 @@ typedef enum MDBX_cursor_op { MDBX_GET_BOTH, /* MDBX_DUPSORT-only: Position at key/data pair. */ MDBX_GET_BOTH_RANGE, /* MDBX_DUPSORT-only: position at key, nearest data. */ MDBX_GET_CURRENT, /* Return key/data at current cursor position */ - MDBX_GET_MULTIPLE, /* MDBX_DUPFIXED-only: Return key and up to a page of - * duplicate data items from current cursor position. - * Move cursor to prepare for MDBX_NEXT_MULTIPLE.*/ + MDBX_GET_MULTIPLE, /* MDBX_DUPFIXED-only: Return up to a page of duplicate + * data items from current cursor position. + * Move cursor to prepare for MDBX_NEXT_MULTIPLE. */ MDBX_LAST, /* Position at last key/data item */ MDBX_LAST_DUP, /* MDBX_DUPSORT-only: Position at last data item * of current key. */ MDBX_NEXT, /* Position at next data item */ MDBX_NEXT_DUP, /* MDBX_DUPSORT-only: Position at next data item * of current key. */ - MDBX_NEXT_MULTIPLE, /* MDBX_DUPFIXED-only: Return key and up to a page of - * duplicate data items from next cursor position. + MDBX_NEXT_MULTIPLE, /* MDBX_DUPFIXED-only: Return up to a page of duplicate + * data items from next cursor position. * Move cursor to prepare for MDBX_NEXT_MULTIPLE. */ MDBX_NEXT_NODUP, /* Position at first data item of next key */ MDBX_PREV, /* Position at previous data item */ @@ -366,7 +366,7 @@ typedef enum MDBX_cursor_op { MDBX_SET_RANGE, /* Position at first key greater than or equal to * specified key. 
*/ MDBX_PREV_MULTIPLE /* MDBX_DUPFIXED-only: Position at previous page and - * return key and up to a page of duplicate data items. */ + * return up to a page of duplicate data items. */ } MDBX_cursor_op; /* Return Codes @@ -966,7 +966,7 @@ LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_PANIC - a fatal error occurred earlier and the environment - * must be shut down. + * must be shut down. * - MDBX_MAP_RESIZED - another process wrote data beyond this MDBX_env's * mapsize and this environment's map must be resized * as well. See mdbx_env_set_mapsize(). From a3aa2b5a57f8b8ff97e34393de23dd4ca3c17c70 Mon Sep 17 00:00:00 2001 From: moneromooo-monero Date: Tue, 15 May 2018 10:53:13 +0100 Subject: [PATCH 49/83] mdbx-doc: import - mdb_cursor_del does not invalidate the cursor (ITS#8857). --- mdbx.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mdbx.h b/mdbx.h index 17272a5b..64aaeec8 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1461,6 +1461,9 @@ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, MDBX_val *key, /* Delete current key/data pair * * This function deletes the key/data pair to which the cursor refers. + * This does not invalidate the cursor, so operations such as MDBX_NEXT + * can still be used on it. Both MDBX_NEXT and MDBX_GET_CURRENT will return + * the same record after this operation. * * [in] cursor A cursor handle returned by mdbx_cursor_open() * [in] flags Options for this operation. This parameter must be set to 0 From cc84f85722b6f3e6050de49e7a465c33b7923dde Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 30 Sep 2018 12:10:43 +0300 Subject: [PATCH 50/83] mdbx-ci: backport - disable CI for old MSVC compilers. 
Change-Id: Ia1072745664d9a97d4114149a305a6399bde71aa --- appveyor.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 3b0660ae..b2008ce6 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -4,10 +4,10 @@ environment: matrix: - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 TOOLSET: v141 - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - TOOLSET: v140 - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2013 - TOOLSET: v120 +# - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 +# TOOLSET: v140 +# - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2013 +# TOOLSET: v120 branches: except: From 5acf2b126f4d2d2c4e07b2d160276f4729cbdcd2 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 19 Oct 2018 13:33:27 +0300 Subject: [PATCH 51/83] mdbx: backport - fix mdbx_dbi_sequence(). Change-Id: Ic620896ef42c1c2d85c07c146b72e773ab43a67d --- src/mdbx.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index 70dd069b..1751087a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -12356,6 +12356,15 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(TXN_DBI_CHANGED(txn, dbi))) return MDBX_BAD_DBI; + if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) { + MDBX_cursor mc; + MDBX_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. */ + int rc = mdbx_cursor_init(&mc, txn, dbi, &mx); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + MDBX_db *dbs = &txn->mt_dbs[dbi]; if (likely(result)) *result = dbs->md_seq; From 08130df5951a1a037fbf18939fbd2c58fc9a0549 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 20 Oct 2018 17:17:31 +0300 Subject: [PATCH 52/83] mdbx-windows: backport - workaround for Windows10 bugs. 
This resolves https://github.com/leo-yuriev/libmdbx/issues/47 Change-Id: I6e0d6dfbfec15b68200438b68a2996c357d46b77 --- src/lck-windows.c | 23 +++++++++++---- src/mdbx.c | 74 ++++++++++++++++++++++++++++++++++++----------- src/osal.c | 10 +++---- 3 files changed, 80 insertions(+), 27 deletions(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 1e9e0cc9..21d6f05a 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -196,12 +196,19 @@ static int suspend_and_append(mdbx_handle_array_t **array, (*array)->limit = limit * 2; } - HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME, FALSE, ThreadId); + HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION, + FALSE, ThreadId); if (hThread == NULL) return GetLastError(); + if (SuspendThread(hThread) == -1) { + int err = GetLastError(); + DWORD ExitCode; + if (err == /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED || + !GetExitCodeThread(hThread, &ExitCode) || ExitCode != STILL_ACTIVE) + err = MDBX_SUCCESS; CloseHandle(hThread); - return GetLastError(); + return err; } (*array)->handles[(*array)->count++] = hThread; @@ -285,9 +292,15 @@ int mdbx_suspend_threads_before_remap(MDBX_env *env, int mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { int rc = MDBX_SUCCESS; for (unsigned i = 0; i < array->count; ++i) { - if (ResumeThread(array->handles[i]) == -1) - rc = GetLastError(); - CloseHandle(array->handles[i]); + const HANDLE hThread = array->handles[i]; + if (ResumeThread(hThread) == -1) { + const int err = GetLastError(); + DWORD ExitCode; + if (err != /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED && + GetExitCodeThread(hThread, &ExitCode) && ExitCode == STILL_ACTIVE) + rc = err; + } + CloseHandle(hThread); } return rc; } diff --git a/src/mdbx.c b/src/mdbx.c index 1751087a..3a9f8426 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2032,18 +2032,20 @@ bailout: VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); } #endif - } else if (rc != MDBX_RESULT_TRUE) { 
- mdbx_error("failed resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, limit_bytes, - rc); } else { - mdbx_notice("unable resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, limit_bytes, - rc); + if (rc != MDBX_RESULT_TRUE) { + mdbx_error("failed resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, + limit_bytes, rc); + } else { + mdbx_notice("unable resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, + limit_bytes, rc); + } if (!env->me_dxb_mmap.address) { env->me_flags |= MDBX_FATAL_ERROR; if (env->me_txn) @@ -2892,6 +2894,16 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { rc = mdbx_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(rc))) return rc; + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { + mdbx_rdt_unlock(env); + return MDBX_PANIC; + } +#if defined(_WIN32) || defined(_WIN64) + if (unlikely(!env->me_map)) { + mdbx_rdt_unlock(env); + return MDBX_EPERM; + } +#endif /* Windows */ rc = MDBX_SUCCESS; if (unlikely(env->me_live_reader != pid)) { @@ -2990,6 +3002,16 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { rc = mdbx_txn_lock(env, F_ISSET(flags, MDBX_TRYTXN)); if (unlikely(rc)) return rc; + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { + mdbx_txn_unlock(env); + return MDBX_PANIC; + } +#if defined(_WIN32) || defined(_WIN64) + if (unlikely(!env->me_map)) { + mdbx_txn_unlock(env); + return MDBX_EPERM; + } +#endif /* Windows */ mdbx_jitter4testing(false); MDBX_meta *meta = mdbx_meta_head(env); @@ -3049,8 +3071,11 @@ static int 
mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { goto bailout; } rc = mdbx_mapresize(env, txn->mt_end_pgno, upper_pgno); - if (rc != MDBX_SUCCESS) + if (rc != MDBX_SUCCESS) { + if (rc == MDBX_RESULT_TRUE) + rc = MDBX_MAP_RESIZED; goto bailout; + } } txn->mt_owner = mdbx_thread_self(); return MDBX_SUCCESS; @@ -3098,6 +3123,7 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (unlikely(!env || !ret)) return MDBX_EINVAL; + *ret = NULL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -3107,8 +3133,12 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; +#if !defined(_WIN32) && !defined(_WIN64) + /* Don't check env->me_map until lock to avoid race with re-mapping for + * shrinking */ if (unlikely(!env->me_map)) return MDBX_EPERM; +#endif /* Windows */ flags &= MDBX_TXN_BEGIN_FLAGS; flags |= env->me_flags & MDBX_WRITEMAP; @@ -3119,16 +3149,21 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (parent) { if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EINVAL; + return MDBX_EBADSIGN; if (unlikely(parent->mt_owner != mdbx_thread_self())) return MDBX_THREAD_MISMATCH; +#if defined(_WIN32) || defined(_WIN64) + if (unlikely(!env->me_map)) + return MDBX_EPERM; +#endif /* Windows */ + /* Nested transactions: Max 1 child, write txns only, no writemap */ flags |= parent->mt_flags; - if (unlikely(flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED))) { + if (unlikely(flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED))) return (parent->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EINVAL : MDBX_BAD_TXN; - } + /* Child txns save MDBX_pgstate and use own copy of cursors */ size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); size += tsize = sizeof(MDBX_ntxn); @@ -3204,10 +3239,12 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, rc = mdbx_txn_renew0(txn, flags); } - if (unlikely(rc)) { + if (unlikely(rc != MDBX_SUCCESS)) { if (txn != env->me_txn0) free(txn); } else { + mdbx_assert(env, + (txn->mt_flags & ~(MDBX_TXN_RDONLY | MDBX_TXN_WRITEMAP)) == 0); txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO @@ -4400,8 +4437,11 @@ int mdbx_txn_commit(MDBX_txn *txn) { rc = mdbx_sync_locked( env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); } - if (unlikely(rc != MDBX_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) { + env->me_flags |= MDBX_FATAL_ERROR; goto fail; + } + env->me_lck->mti_readers_refresh_flag = false; end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; diff --git a/src/osal.c b/src/osal.c index 52a8cae8..faa7cc16 100644 --- a/src/osal.c +++ b/src/osal.c @@ -973,11 +973,11 @@ int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, size_t limit) { &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); if (!NT_SUCCESS(status)) { ReservedAddress = NULL; - if (status != /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 || - limit == map->length) + if (status != /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) goto bailout_ntstatus /* no way to recovery */; - /* assume we can change base address if mapping size changed */ + /* assume we can change base address if mapping size changed or prev address + * couldn't be used */ map->address = NULL; } @@ -1034,8 +1034,8 @@ retry_mapview:; if (!NT_SUCCESS(status)) { if (status == /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && - map->address && limit != map->length) { - /* try remap at another base address, but only if the limit is changing */ + map->address) { + /* try 
remap at another base address */ map->address = NULL; goto retry_mapview; } From affd28654c481d2fc1d13e1bba7a646879754066 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 21 Oct 2018 18:28:19 +0300 Subject: [PATCH 53/83] mdbx: backport - fix mdbx_txn_abort(). This resolves https://github.com/leo-yuriev/libfpta/issues/20 Change-Id: I43c0c960d5c871d837b307cd370ee7327db01ff6 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 3a9f8426..645a2f26 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3435,7 +3435,7 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(txn->mt_owner && txn->mt_owner != mdbx_thread_self())) + if (unlikely(txn->mt_owner != mdbx_thread_self())) return MDBX_THREAD_MISMATCH; if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) From 86e63f0b6b23fd55b65910551d965f391427633a Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 4 Nov 2018 18:57:15 +0300 Subject: [PATCH 54/83] mdbx: backport - refine mdbx_env_copy() internals (required for next patch). Change-Id: I9e8f0dc87398564524a5ec98eda2cb9bde100909 --- src/mdbx.c | 259 ++++++++++++++++++++++++++++++----------------------- src/osal.c | 20 +++++ src/osal.h | 2 + 3 files changed, 168 insertions(+), 113 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 645a2f26..cef8e69a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1,4 +1,4 @@ -/* +/* * Copyright 2015-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -10576,184 +10576,215 @@ done: } /* Copy environment with compaction. 
*/ -static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { - MDBX_txn *txn = NULL; - mdbx_thread_t thr; - mdbx_copy ctx; - memset(&ctx, 0, sizeof(ctx)); - - int rc = mdbx_condmutex_init(&ctx.mc_condmutex); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - const size_t buffer_size = pgno2bytes(env, NUM_METAS) + MDBX_WBUF * 2; - uint8_t *buffer = NULL; - rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - - ctx.mc_wbuf[0] = buffer + pgno2bytes(env, NUM_METAS); - memset(ctx.mc_wbuf[0], 0, MDBX_WBUF * 2); - ctx.mc_wbuf[1] = ctx.mc_wbuf[0] + MDBX_WBUF; - ctx.mc_next_pgno = NUM_METAS; - ctx.mc_env = env; - ctx.mc_fd = fd; - rc = mdbx_thread_create(&thr, mdbx_env_copythr, &ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - - rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); - if (unlikely(rc != MDBX_SUCCESS)) - goto finish; - +static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer) { MDBX_page *const meta = mdbx_init_metas(env, buffer); + /* copy canary sequenses if present */ + if (read_txn->mt_canary.v) { + meta->mp_meta.mm_canary = read_txn->mt_canary; + meta->mp_meta.mm_canary.v = mdbx_meta_txnid_stable(env, &meta->mp_meta); + } /* Set metapage 1 with current main DB */ - pgno_t new_root, root = txn->mt_dbs[MAIN_DBI].md_root; - if ((new_root = root) != P_INVALID) { + pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root; + if ((new_root = root) == P_INVALID) { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. */ + meta->mp_meta.mm_dbs[MAIN_DBI].md_flags = + read_txn->mt_dbs[MAIN_DBI].md_flags; + } else { /* Count free pages + freeDB pages. Subtract from last_pg * to find the new last_pg, which also becomes the new root. 
*/ pgno_t freecount = 0; MDBX_cursor mc; MDBX_val key, data; - rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); + int rc = mdbx_cursor_init(&mc, read_txn, FREE_DBI, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; while ((rc = mdbx_cursor_get(&mc, &key, &data, MDBX_NEXT)) == 0) freecount += *(pgno_t *)data.iov_base; if (unlikely(rc != MDBX_NOTFOUND)) - goto finish; + return rc; - freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + - txn->mt_dbs[FREE_DBI].md_leaf_pages + - txn->mt_dbs[FREE_DBI].md_overflow_pages; + freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages + + read_txn->mt_dbs[FREE_DBI].md_leaf_pages + + read_txn->mt_dbs[FREE_DBI].md_overflow_pages; - new_root = txn->mt_next_pgno - 1 - freecount; - meta->mp_meta.mm_geo.next = meta->mp_meta.mm_geo.now = new_root + 1; - meta->mp_meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + new_root = read_txn->mt_next_pgno - 1 - freecount; + meta->mp_meta.mm_geo.next = new_root + 1; + meta->mp_meta.mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; meta->mp_meta.mm_dbs[MAIN_DBI].md_root = new_root; - } else { - /* When the DB is empty, handle it specially to - * fix any breakage like page leaks from ITS#8174. 
*/ - meta->mp_meta.mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; - } - /* copy canary sequenses if present */ - if (txn->mt_canary.v) { - meta->mp_meta.mm_canary = txn->mt_canary; - meta->mp_meta.mm_canary.v = mdbx_meta_txnid_stable(env, &meta->mp_meta); - } + mdbx_copy ctx; + memset(&ctx, 0, sizeof(ctx)); + rc = mdbx_condmutex_init(&ctx.mc_condmutex); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* update signature */ - meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); + ctx.mc_wbuf[0] = buffer + pgno2bytes(env, NUM_METAS); + memset(ctx.mc_wbuf[0], 0, MDBX_WBUF * 2); + ctx.mc_wbuf[1] = ctx.mc_wbuf[0] + MDBX_WBUF; + ctx.mc_next_pgno = NUM_METAS; + ctx.mc_env = env; + ctx.mc_fd = fd; + ctx.mc_txn = read_txn; + + mdbx_thread_t thread; + int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx); + if (likely(thread_err == MDBX_SUCCESS)) { + rc = mdbx_env_cwalk(&ctx, &root, 0); + mdbx_env_cthr_toggle(&ctx, 1 | MDBX_EOF); + thread_err = mdbx_thread_join(thread); + mdbx_condmutex_destroy(&ctx.mc_condmutex); + } + if (unlikely(thread_err != MDBX_SUCCESS)) + return thread_err; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(ctx.mc_error != MDBX_SUCCESS)) + return ctx.mc_error; - ctx.mc_wlen[0] = pgno2bytes(env, NUM_METAS); - ctx.mc_txn = txn; - rc = mdbx_env_cwalk(&ctx, &root, 0); - if (rc == MDBX_SUCCESS && root != new_root) { if (root > new_root) { mdbx_error("post-compactification root %" PRIaPGNO " GT expected %" PRIaPGNO " (source DB corrupted)", root, new_root); - rc = MDBX_CORRUPTED; /* page leak or corrupt DB */ - } else { - mdbx_error("post-compactification root %" PRIaPGNO - " LT expected %" PRIaPGNO " (page leak(s) in source DB)", - root, new_root); - /* fixup and rewrite metas */ + return MDBX_CORRUPTED; /* page leak or corrupt DB */ + } + if (root < new_root) { + mdbx_notice("post-compactification root %" PRIaPGNO + " LT expected %" PRIaPGNO " (page leak(s) in source DB)", + root, new_root); + /* 
fixup meta */ meta->mp_meta.mm_dbs[MAIN_DBI].md_root = root; - meta->mp_meta.mm_geo.next = meta->mp_meta.mm_geo.now = root + 1; - meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); - rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + meta->mp_meta.mm_geo.next = root + 1; } } -finish: - if (rc != MDBX_SUCCESS) - ctx.mc_error = rc; - mdbx_env_cthr_toggle(&ctx, 1 | MDBX_EOF); - rc = mdbx_thread_join(thr); - mdbx_txn_abort(txn); - -done: - mdbx_memalign_free(buffer); - mdbx_condmutex_destroy(&ctx.mc_condmutex); - return rc ? rc : ctx.mc_error; + /* update signature */ + meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); + return MDBX_SUCCESS; } /* Copy environment as-is. */ -static int __cold mdbx_env_copy_asis(MDBX_env *env, mdbx_filehandle_t fd) { - MDBX_txn *txn = NULL; - - /* Do the lock/unlock of the reader mutex before starting the - * write txn. Otherwise other read txns could block writers. */ - int rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); +static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer) { + /* We must start the actual read txn after blocking writers */ + int rc = mdbx_txn_end(read_txn, MDBX_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) return rc; - /* We must start the actual read txn after blocking writers */ - rc = mdbx_txn_end(txn, MDBX_END_RESET_TMP); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; /* FIXME: or just return? 
*/ - /* Temporarily block writers until we snapshot the meta pages */ rc = mdbx_txn_lock(env, false); if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + return rc; - rc = mdbx_txn_renew0(txn, MDBX_RDONLY); + rc = mdbx_txn_renew0(read_txn, MDBX_RDONLY); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_txn_unlock(env); - goto bailout; + return rc; } - rc = mdbx_write(fd, env->me_map, pgno2bytes(env, NUM_METAS)); - MDBX_meta *const head = mdbx_meta_head(env); + /* Make a snapshot of meta-pages, + * but writing ones after the data was flushed */ + memcpy(buffer, env->me_map, pgno2bytes(env, NUM_METAS)); + MDBX_meta *const headcopy = /* LY: get pointer to the spanshot copy */ + (MDBX_meta *)(buffer + ((uint8_t *)mdbx_meta_head(env) - env->me_map)); const uint64_t size = - mdbx_roundup2(pgno2bytes(env, head->mm_geo.now), env->me_os_psize); + mdbx_roundup2(pgno2bytes(env, headcopy->mm_geo.now), env->me_os_psize); mdbx_txn_unlock(env); - if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_write(fd, env->me_map + pgno2bytes(env, NUM_METAS), - pgno2bytes(env, txn->mt_next_pgno - NUM_METAS)); + /* Update signature to steady */ + headcopy->mm_datasync_sign = mdbx_meta_sign(headcopy); + + /* Copy the data */ + rc = mdbx_pwrite(fd, env->me_map + pgno2bytes(env, NUM_METAS), + pgno2bytes(env, read_txn->mt_next_pgno - NUM_METAS), + pgno2bytes(env, NUM_METAS)); if (likely(rc == MDBX_SUCCESS)) rc = mdbx_ftruncate(fd, size); -bailout: - mdbx_txn_abort(txn); return rc; } int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, unsigned flags) { - if (flags & MDBX_CP_COMPACT) - return mdbx_env_compact(env, fd); + if (unlikely(!env)) + return MDBX_EINVAL; - return mdbx_env_copy_asis(env, fd); + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + int rc = mdbx_fseek(fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + const size_t buffer_size = pgno2bytes(env, NUM_METAS) + + ((flags & MDBX_CP_COMPACT) ? 
MDBX_WBUF * 2 : 0); + uint8_t *buffer = NULL; + rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + MDBX_txn *read_txn = NULL; + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. */ + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &read_txn); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_memalign_free(buffer); + return rc; + } + + /* Firstly write a stub to meta-pages. + * Now we sure to incomplete copy will not be used. */ + memset(buffer, -1, pgno2bytes(env, NUM_METAS)); + rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + if (likely(rc == MDBX_SUCCESS)) { + memset(buffer, 0, pgno2bytes(env, NUM_METAS)); + rc = (flags & MDBX_CP_COMPACT) + ? mdbx_env_compact(env, read_txn, fd, buffer) + : mdbx_env_copy_asis(env, read_txn, fd, buffer); + } + mdbx_txn_abort(read_txn); + + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_filesync(fd, true); + + /* Write actual meta */ + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + + mdbx_memalign_free(buffer); + return rc; } -int __cold mdbx_env_copy(MDBX_env *env, const char *path, unsigned flags) { - char *lck_pathname; +int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, unsigned flags) { + if (unlikely(!env || !dest_path)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + char *dxb_pathname; mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; if (env->me_flags & MDBX_NOSUBDIR) { - lck_pathname = (char *)path; + dxb_pathname = (char *)dest_path; } else { - size_t len = strlen(path); + size_t len = strlen(dest_path); len += sizeof(MDBX_DATANAME); - lck_pathname = malloc(len); - if (!lck_pathname) + dxb_pathname = malloc(len); + if (!dxb_pathname) return MDBX_ENOMEM; - sprintf(lck_pathname, "%s" MDBX_DATANAME, path); + sprintf(dxb_pathname, "%s" MDBX_DATANAME, dest_path); 
} /* The destination path must exist, but the destination file must not. * We don't want the OS to cache the writes, since the source data is * already in the OS cache. */ int rc = - mdbx_openfile(lck_pathname, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); + mdbx_openfile(dxb_pathname, O_WRONLY | O_CREAT | O_EXCL, 0640, &newfd); if (rc == MDBX_SUCCESS) { if (env->me_psize >= env->me_os_psize) { #ifdef F_NOCACHE /* __APPLE__ */ @@ -10767,15 +10798,17 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *path, unsigned flags) { rc = mdbx_env_copy2fd(env, newfd, flags); } - if (!(env->me_flags & MDBX_NOSUBDIR)) - free(lck_pathname); - if (newfd != INVALID_HANDLE_VALUE) { int err = mdbx_closefile(newfd); if (rc == MDBX_SUCCESS && err != rc) rc = err; + if (rc != MDBX_SUCCESS) + (void)mdbx_removefile(dxb_pathname); } + if (dxb_pathname != dest_path) + free(dxb_pathname); + return rc; } diff --git a/src/osal.c b/src/osal.c index faa7cc16..502205ae 100644 --- a/src/osal.c +++ b/src/osal.c @@ -399,6 +399,13 @@ int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { /*----------------------------------------------------------------------------*/ +int mdbx_removefile(const char *pathname) { +#if defined(_WIN32) || defined(_WIN64) + return DeleteFileA(pathname) ? MDBX_SUCCESS : GetLastError(); +#else + return unlink(pathname) ? errno : MDBX_SUCCESS; +#endif +} int mdbx_openfile(const char *pathname, int flags, mode_t mode, mdbx_filehandle_t *fd) { *fd = INVALID_HANDLE_VALUE; @@ -698,6 +705,19 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #endif } +int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER li; + li.QuadPart = pos; + return SetFilePointerEx(fd, li, NULL, FILE_BEGIN) ? MDBX_SUCCESS + : GetLastError(); +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + return (lseek(fd, pos, SEEK_SET) < 0) ? 
errno : MDBX_SUCCESS; +#endif +} + /*----------------------------------------------------------------------------*/ int mdbx_thread_create(mdbx_thread_t *thread, diff --git a/src/osal.h b/src/osal.h index 0b230a4a..5754a83a 100644 --- a/src/osal.h +++ b/src/osal.h @@ -473,10 +473,12 @@ int mdbx_thread_join(mdbx_thread_t thread); int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync); int mdbx_filesize_sync(mdbx_filehandle_t fd); int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); +int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); int mdbx_openfile(const char *pathname, int flags, mode_t mode, mdbx_filehandle_t *fd); int mdbx_closefile(mdbx_filehandle_t fd); +int mdbx_removefile(const char *pathname); typedef struct mdbx_mmap_param { union { From 3d59c9f9e772638a4df72f4c193c3c6007504fc3 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 4 Nov 2018 18:57:19 +0300 Subject: [PATCH 55/83] mdbx: backport - take in account shrink/growing thresholds while copy-with-compactification. Change-Id: Id93e62089819dfcc8cbc83620e0bdd806d8c1950 --- src/mdbx.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index cef8e69a..7dd1053b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10661,9 +10661,38 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } } - /* update signature */ + /* Calculate filesize taking in account shrink/growing thresholds */ + if (meta->mp_meta.mm_geo.next > meta->mp_meta.mm_geo.now) { + const pgno_t aligned = + pgno_align2os_pgno(env, pgno_add(meta->mp_meta.mm_geo.next, + meta->mp_meta.mm_geo.grow - + meta->mp_meta.mm_geo.next % + meta->mp_meta.mm_geo.grow)); + meta->mp_meta.mm_geo.now = aligned; + } else if (meta->mp_meta.mm_geo.next < meta->mp_meta.mm_geo.now) { + meta->mp_meta.mm_geo.now = meta->mp_meta.mm_geo.next; + const pgno_t aligner = meta->mp_meta.mm_geo.grow + ? 
meta->mp_meta.mm_geo.grow + : meta->mp_meta.mm_geo.shrink; + const pgno_t aligned = + pgno_align2os_pgno(env, meta->mp_meta.mm_geo.next + aligner - + meta->mp_meta.mm_geo.next % aligner); + meta->mp_meta.mm_geo.now = aligned; + } + + if (meta->mp_meta.mm_geo.now < meta->mp_meta.mm_geo.lower) + meta->mp_meta.mm_geo.now = meta->mp_meta.mm_geo.lower; + if (meta->mp_meta.mm_geo.now > meta->mp_meta.mm_geo.upper) + meta->mp_meta.mm_geo.now = meta->mp_meta.mm_geo.upper; + + /* Update signature */ + assert(meta->mp_meta.mm_geo.now >= meta->mp_meta.mm_geo.next); meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); - return MDBX_SUCCESS; + + /* Extend file if required */ + return (meta->mp_meta.mm_geo.now != meta->mp_meta.mm_geo.next) + ? mdbx_ftruncate(fd, pgno2bytes(env, meta->mp_meta.mm_geo.now)) + : MDBX_SUCCESS; } /* Copy environment as-is. */ From ee0c8bb249396da61812eec6e996b50f36d9f1b4 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 4 Nov 2018 18:57:36 +0300 Subject: [PATCH 56/83] mdbx: backport - add db-copy testcase. 
Change-Id: Ib554880ebbabcb5dfc55bdb3c71767d0fa1630fd --- libmdbx.files | 1 + mdbx.h | 3 +++ test/CMakeLists.txt | 51 ++++++++++++++++++++++---------------------- test/cases.cc | 3 ++- test/config.h | 5 +++-- test/copy.cc | 26 ++++++++++++++++++++++ test/main.cc | 4 ++++ test/osal-unix.cc | 6 +++++- test/osal-windows.cc | 6 +++++- test/osal.h | 3 ++- test/test.cc | 7 +++++- test/test.h | 11 ++++++++++ test/test.vcxproj | 1 + 13 files changed, 95 insertions(+), 32 deletions(-) create mode 100644 test/copy.cc diff --git a/libmdbx.files b/libmdbx.files index 38125146..653b0397 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -3,6 +3,7 @@ README-RU.md pcrf_test/CMakeLists.txt src/tools/CMakeLists.txt test/CMakeLists.txt +test/copy.cc tutorial/CMakeLists.txt tutorial/sample-mdbx.c AUTHORS diff --git a/mdbx.h b/mdbx.h index 64aaeec8..cedcbc11 100644 --- a/mdbx.h +++ b/mdbx.h @@ -100,6 +100,7 @@ typedef DWORD mdbx_tid_t; #define MDBX_EIO ERROR_WRITE_FAULT #define MDBX_EPERM ERROR_INVALID_FUNCTION #define MDBX_EINTR ERROR_CANCELLED +#define MDBX_ENOFILE ERROR_FILE_NOT_FOUND #else @@ -120,6 +121,8 @@ typedef pthread_t mdbx_tid_t; #define MDBX_EIO EIO #define MDBX_EPERM EPERM #define MDBX_EINTR EINTR +#define MDBX_ENOFILE ENOENT + #endif #ifdef _MSC_VER diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3ed01bdd..ca7dd794 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,34 +1,35 @@ -set(TARGET mdbx_test) +set(TARGET mdbx_test) project(${TARGET}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-cast-qual") add_executable(${TARGET} - base.h - cases.cc - chrono.cc - chrono.h - config.cc - config.h - dead.cc - hill.cc - jitter.cc - keygen.cc - keygen.h - log.cc - log.h - main.cc - osal.h - osal-unix.cc - test.cc - test.h - try.cc - utils.cc - utils.h -) + base.h + cases.cc + chrono.cc + chrono.h + config.cc + config.h + copy.cc + dead.cc + hill.cc + jitter.cc + keygen.cc + keygen.h + log.cc 
+ log.h + main.cc + osal.h + osal-unix.cc + test.cc + test.h + try.cc + utils.cc + utils.h + ) target_link_libraries(${TARGET} - mdbx - ) + mdbx + ) diff --git a/test/cases.cc b/test/cases.cc index 4f4306d5..13d47576 100644 --- a/test/cases.cc +++ b/test/cases.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -68,6 +68,7 @@ void testcase_setup(const char *casename, actor_params ¶ms, configure_actor(last_space_id, ac_jitter, nullptr, params); configure_actor(last_space_id, ac_hill, nullptr, params); configure_actor(last_space_id, ac_try, nullptr, params); + configure_actor(last_space_id, ac_copy, nullptr, params); log_notice("<<< testcase_setup(%s): done", casename); } else { failure("unknown testcase `%s`", casename); diff --git a/test/config.h b/test/config.h index 2d0fede0..1886a8ea 100644 --- a/test/config.h +++ b/test/config.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -26,7 +26,8 @@ enum actor_testcase { ac_deadread, ac_deadwrite, ac_jitter, - ac_try + ac_try, + ac_copy }; enum actor_status { diff --git a/test/copy.cc b/test/copy.cc new file mode 100644 index 00000000..e239d41e --- /dev/null +++ b/test/copy.cc @@ -0,0 +1,26 @@ +#include "test.h" + +void testcase_copy::copy_db(const bool with_compaction) { + int err = osal_removefile(copy_pathname); + if (err != MDBX_SUCCESS && err != MDBX_ENOFILE) + failure_perror("mdbx_removefile()", err); + + err = mdbx_env_copy(db_guard.get(), copy_pathname.c_str(), + with_compaction ? MDBX_CP_COMPACT : 0); + if (unlikely(err != MDBX_SUCCESS)) + failure_perror(with_compaction ? 
"mdbx_env_copy(MDBX_CP_COMPACT)" + : "mdbx_env_copy(MDBX_CP_ASIS)", + err); +} + +bool testcase_copy::run() { + jitter_delay(); + db_open(); + assert(!txn_guard); + const bool order = flipcoin(); + jitter_delay(); + copy_db(order); + jitter_delay(); + copy_db(!order); + return true; +} diff --git a/test/main.cc b/test/main.cc index 3384311b..275b7b13 100644 --- a/test/main.cc +++ b/test/main.cc @@ -337,6 +337,10 @@ int main(int argc, char *const argv[]) { configure_actor(last_space_id, ac_deadwrite, value, params); continue; } + if (config::parse_option(argc, argv, narg, "copy", nullptr)) { + configure_actor(last_space_id, ac_copy, value, params); + continue; + } if (config::parse_option(argc, argv, narg, "failfast", global::config::failfast)) continue; diff --git a/test/osal-unix.cc b/test/osal-unix.cc index 6661ae42..1856d0f8 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -301,3 +301,7 @@ std::string osal_tempdir(void) { return "/dev/shm/"; return ""; } + +int osal_removefile(const std::string &pathname) { + return unlink(pathname.c_str()) ? errno : MDBX_SUCCESS; +} diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 7d59f657..f7f1de56 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -405,3 +405,7 @@ std::string osal_tempdir(void) { DWORD len = GetTempPathA(sizeof(buf), buf); return std::string(buf, len); } + +int osal_removefile(const std::string &pathname) { + return DeleteFileA(pathname.c_str()) ? MDBX_SUCCESS : GetLastError(); +} diff --git a/test/osal.h b/test/osal.h index c27282a6..3ccc7bbe 100644 --- a/test/osal.h +++ b/test/osal.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. @@ -32,6 +32,7 @@ void osal_udelay(unsigned us); void osal_yield(void); bool osal_istty(int fd); std::string osal_tempdir(void); +int osal_removefile(const std::string &pathname); #ifdef _MSC_VER #ifndef STDIN_FILENO diff --git a/test/test.cc b/test/test.cc index c28bbd22..445c4c8d 100644 --- a/test/test.cc +++ b/test/test.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2018 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -31,6 +31,8 @@ const char *testcase2str(const actor_testcase testcase) { return "jitter"; case ac_try: return "try"; + case ac_copy: + return "copy"; } } @@ -443,6 +445,9 @@ bool test_execute(const actor_config &config) { case ac_try: test.reset(new testcase_try(config, pid)); break; + case ac_copy: + test.reset(new testcase_copy(config, pid)); + break; default: test.reset(new testcase(config, pid)); break; diff --git a/test/test.h b/test/test.h index ef1c4caa..765940ce 100644 --- a/test/test.h +++ b/test/test.h @@ -203,3 +203,14 @@ public: bool run(); bool teardown(); }; + +class testcase_copy : public testcase { + const std::string copy_pathname; + void copy_db(const bool with_compaction); + +public: + testcase_copy(const actor_config &config, const mdbx_pid_t pid) + : testcase(config, pid), + copy_pathname(config.params.pathname_db + "-copy") {} + bool run(); +}; diff --git a/test/test.vcxproj b/test/test.vcxproj index a8c21d38..98ff7f49 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -184,6 +184,7 @@ + From 6120c2be0afcd0046320f9dd3408dd4a0234cb5b Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 4 Nov 2018 18:57:38 +0300 Subject: [PATCH 57/83] mdbx-test: backport - add checks for `db-copy` after the `basic` testcase. 
Change-Id: I5e7d343266c66418a8798d272e697e1c3d5c775b --- Makefile | 9 ++++++--- test/gc.sh | 5 +++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index dabf2f98..11a6140f 100644 --- a/Makefile +++ b/Makefile @@ -82,13 +82,16 @@ clean: rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o check: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) && ./mdbx_chk -vvn $(TESTDB) + rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \ + && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy check-singleprocess: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --dont-cleanup-after --hill | tee -a $(TESTLOG) | tail -n 42) && ./mdbx_chk -vvn $(TESTDB) + rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --dont-cleanup-after --hill | tee -a $(TESTLOG) | tail -n 42) \ + && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy check-fault: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --inject-writefault=42 --dump-config --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) && ./mdbx_chk -vvn $(TESTDB) + rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --inject-writefault=42 --dump-config --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \ + && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy define core-rule $(patsubst %.c,%.o,$(1)): $(1) $(CORE_INC) mdbx.h Makefile diff --git a/test/gc.sh b/test/gc.sh index b8431d0f..1ab5b305 100755 --- a/test/gc.sh +++ b/test/gc.sh @@ -25,6 +25,7 @@ function probe { rm -f ${TESTDB_PREFIX}* \ && ./mdbx_test --pathname=${TESTDB_PREFIX}db "$@" | lz4 > ${TESTDB_PREFIX}log.lz4 \ && ./mdbx_chk -nvv 
${TESTDB_PREFIX}db | tee ${TESTDB_PREFIX}chk \ + && ./mdbx_chk -nvv ${TESTDB_PREFIX}db-copy | tee ${TESTDB_PREFIX}chk-copy \ || (echo "FAILED"; exit 1) } @@ -52,11 +53,11 @@ for nops in {2..7}; do caption="Probe #$((++count)) w/o-dups, repeat ${rep} of ${loops}" probe \ --pagesize=min --size=6G --table=-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --keygen.seed=${seed} --hill + --keygen.seed=${seed} basic caption="Probe #$((++count)) with-dups, repeat ${rep} of ${loops}" probe \ --pagesize=min --size=6G --table=+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --keygen.seed=${seed} --hill + --keygen.seed=${seed} basic done done done From feab109c61c02ce78cef0c5a4f8d1f9df39de606 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 4 Nov 2018 20:46:59 +0300 Subject: [PATCH 58/83] mdbx-test: backport - fix osal_actor_poll() for 32-bit builds. Change-Id: I36b2f955295d8ca5435a68737c0c2e7f069bfe34 --- test/osal-unix.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/osal-unix.cc b/test/osal-unix.cc index 1856d0f8..6e6d7a1c 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -184,7 +184,7 @@ void osal_killall_actors(void) { int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) { struct timespec ts; ts.tv_nsec = 0; - ts.tv_sec = timeout; + ts.tv_sec = (timeout > INT_MAX) ? INT_MAX : timeout; retry: int status, options = WNOHANG; #ifdef WUNTRACED From de8d0479ab49d3f46ff059cda88d0c7f5c1f2da3 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 5 Nov 2018 00:18:41 +0300 Subject: [PATCH 59/83] mdbx: backport - fix typo in mdbx_limits_dbsize_max(). 
Change-Id: Ie55e3ca108ac6aab9a41d65f316a3d5ff5ff6f1f --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 7dd1053b..dba5f612 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -12518,7 +12518,7 @@ __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { const uint64_t limit = MAX_PAGENO * (uint64_t)pagesize; return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit - : (intptr_t)MAX_PAGESIZE; + : (intptr_t)MAX_MAPSIZE; } /*----------------------------------------------------------------------------*/ From 0043f62a43a8c82ed301d1b675906c64673c23af Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 5 Nov 2018 13:46:08 +0300 Subject: [PATCH 60/83] mdbx: backport - silently put mm_geo.now into [geo.lower...geo.upper] boundaries. Copy-with-compaction by a previous version of libmdbx could produce a DB-file smaller than the meta.geo.lower bound, in case the actual filling is low or there is no data at all. This is not a problem as there is no damage or loss of data. Therefore it is better not to consider such a situation as an error, but silently correct it.
Change-Id: Ia662656cc3584c07efcfbdfc80f80e3c76e6dd59 --- src/mdbx.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index dba5f612..95120ca9 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4592,15 +4592,6 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, continue; } - /* LY: check end_pgno */ - if (page.mp_meta.mm_geo.now < page.mp_meta.mm_geo.lower || - page.mp_meta.mm_geo.now > page.mp_meta.mm_geo.upper) { - mdbx_notice("meta[%u] has invalid end-pageno (%" PRIaPGNO "), skip it", - meta_number, page.mp_meta.mm_geo.now); - rc = MDBX_CORRUPTED; - continue; - } - /* LY: check last_pgno */ if (page.mp_meta.mm_geo.next < MIN_PAGENO || page.mp_meta.mm_geo.next - 1 > MAX_PAGENO) { @@ -4614,6 +4605,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, const uint64_t used_bytes = page.mp_meta.mm_geo.next * (uint64_t)page.mp_meta.mm_psize; if (used_bytes > *filesize) { + /* Here could be a race with DB-shrinking performed by other process */ rc = mdbx_filesize(env->me_fd, filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -4657,10 +4649,20 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, "but size of used space still acceptable (%" PRIu64 ")", meta_number, mapsize_max, used_bytes); page.mp_meta.mm_geo.upper = (pgno_t)(MAX_MAPSIZE / page.mp_meta.mm_psize); - if (page.mp_meta.mm_geo.now > page.mp_meta.mm_geo.upper) - page.mp_meta.mm_geo.now = page.mp_meta.mm_geo.upper; } + /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper]. + * + * Copy-with-compaction by previous version of libmfbx could produce DB-file + * less than meta.geo.lower bound, in case actual filling is low or no data + * at all. This is not a problem as there is no damage or loss of data. + * Therefore it is better not to consider such situation as an error, but + * silently correct it. 
*/ + if (page.mp_meta.mm_geo.now < page.mp_meta.mm_geo.lower) + page.mp_meta.mm_geo.now = page.mp_meta.mm_geo.lower; + if (page.mp_meta.mm_geo.now > page.mp_meta.mm_geo.upper) + page.mp_meta.mm_geo.now = page.mp_meta.mm_geo.upper; + if (page.mp_meta.mm_geo.next > page.mp_meta.mm_geo.now) { mdbx_notice("meta[%u] next-pageno (%" PRIaPGNO ") is beyond end-pgno (%" PRIaPGNO "), skip it", @@ -5601,6 +5603,10 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } if (env->me_flags & MDBX_RDONLY) { + if (filesize_before_mmap % env->me_os_psize) { + mdbx_error("filesize should be rounded-up to system page"); + return MDBX_WANNA_RECOVERY; + } mdbx_notice("ignore filesize mismatch in readonly-mode"); } else { mdbx_info("resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO " pages", From 850fe8408edaae2d21295cb34a0b78206d129a00 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 19 Nov 2018 13:30:10 +0300 Subject: [PATCH 61/83] mdbx: backport - relax DBI-sequences for concurrent open DBI-handles for present tables. 
Change-Id: I7f07d2e716074bd9c2847aeb062e366f46cca214 --- src/mdbx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 95120ca9..e6907a8c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -11213,19 +11213,19 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, bailout: free(namedup); } else { - txn->mt_dbiseqs[slot] = (env->me_dbiseqs[slot] += 1); txn->mt_dbflags[slot] = (uint8_t)dbflag; txn->mt_dbxs[slot].md_name.iov_base = namedup; - mdbx_compiler_barrier(); txn->mt_dbxs[slot].md_name.iov_len = len; - if (slot == txn->mt_numdbs) - txn->mt_numdbs++; + txn->mt_numdbs += (slot == txn->mt_numdbs); if ((dbflag & DB_CREAT) == 0) { env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | MDBX_VALID; mdbx_compiler_barrier(); if (env->me_numdbs <= slot) env->me_numdbs = slot + 1; + } else { + env->me_dbiseqs[slot] += 1; } + txn->mt_dbiseqs[slot] = env->me_dbiseqs[slot]; *dbi = slot; } From f183cef7d770dedb68f999cb135d6c39c76b90b7 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 22 Nov 2018 14:19:49 +0300 Subject: [PATCH 62/83] mdbx-windows: backport - always suspend local threads while resize DB (workaround for Windows kernel bug). We should not be concerned about performance on the Windows platform; it is just unreasonable. Therefore just always suspend the local threads to avoid this issue.
This resolves https://github.com/leo-yuriev/libmdbx/issues/48 Change-Id: I6e652692794b8c4c0d41625be62f2051b63c033a --- src/mdbx.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index e6907a8c..b2124fe9 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1983,19 +1983,21 @@ static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, env->me_dxb_mmap.current == env->me_dxb_mmap.filesize) goto bailout; - if ((env->me_flags & MDBX_RDONLY) || limit_bytes != env->me_dxb_mmap.length || - size_bytes < env->me_dxb_mmap.current) { - /* Windows allows only extending a read-write section, but not a - * corresponing mapped view. Therefore in other cases we must suspend - * the local threads for safe remap. */ - array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); - array_onstack.count = 0; - suspended = &array_onstack; - rc = mdbx_suspend_threads_before_remap(env, &suspended); - if (rc != MDBX_SUCCESS) { - mdbx_error("failed suspend-for-remap: errcode %d", rc); - goto bailout; - } + /* 1) Windows allows only extending a read-write section, but not a + * corresponing mapped view. Therefore in other cases we must suspend + * the local threads for safe remap. + * 2) At least on Windows 10 1803 the entire mapped section is unavailable + * for short time during NtExtendSection() or VirtualAlloc() execution. + * + * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! + */ + array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); + array_onstack.count = 0; + suspended = &array_onstack; + rc = mdbx_suspend_threads_before_remap(env, &suspended); + if (rc != MDBX_SUCCESS) { + mdbx_error("failed suspend-for-remap: errcode %d", rc); + goto bailout; } #else /* Acquire guard to avoid collision between read and write txns From 4f79e3756c786003a998065ef894d35c2a78e1be Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 3 Feb 2019 12:28:01 +0300 Subject: [PATCH 63/83] mdbx: 2019 HNY. 
Change-Id: Iec6b7affedef0ea89fad917eb17af7e8201a7482 --- COPYRIGHT | 2 +- README-RU.md | 29 +++++++++++++---------------- mdbx.h | 6 +++--- src/bits.h | 2 +- src/defs.h | 2 +- src/lck-posix.c | 2 +- src/lck-windows.c | 2 +- src/mdbx.c | 4 ++-- src/osal.c | 2 +- src/osal.h | 2 +- src/tools/mdbx_chk.c | 4 ++-- src/tools/mdbx_copy.1 | 2 +- src/tools/mdbx_copy.c | 2 +- src/tools/mdbx_dump.1 | 2 +- src/tools/mdbx_dump.c | 2 +- src/tools/mdbx_load.1 | 2 +- src/tools/mdbx_load.c | 2 +- src/tools/mdbx_stat.1 | 2 +- src/tools/mdbx_stat.c | 2 +- src/version.c | 2 +- test/CMakeLists.txt | 2 +- test/base.h | 2 +- test/cases.cc | 4 ++-- test/chrono.cc | 2 +- test/chrono.h | 2 +- test/config.cc | 2 +- test/config.h | 4 ++-- test/copy.cc | 2 +- test/dead.cc | 2 +- test/hill.cc | 2 +- test/jitter.cc | 2 +- test/keygen.cc | 2 +- test/keygen.h | 2 +- test/log.cc | 2 +- test/log.h | 2 +- test/main.cc | 4 ++-- test/osal-unix.cc | 4 ++-- test/osal-windows.cc | 4 ++-- test/osal.h | 4 ++-- test/test.cc | 4 ++-- test/test.h | 2 +- test/utils.cc | 2 +- test/utils.h | 2 +- tutorial/sample-bdb.txt | 2 +- tutorial/sample-mdbx.c | 2 +- 45 files changed, 68 insertions(+), 71 deletions(-) diff --git a/COPYRIGHT b/COPYRIGHT index 7c2fd24c..46e09610 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright 2015-2018 Leonid Yuriev . +Copyright 2015-2019 Leonid Yuriev . Copyright 2011-2015 Howard Chu, Symas Corp. Copyright 2015,2016 Peter-Service R&D LLC. All rights reserved. diff --git a/README-RU.md b/README-RU.md index 23354555..622b682c 100644 --- a/README-RU.md +++ b/README-RU.md @@ -12,24 +12,21 @@ and [by Yandex](https://translate.yandex.ru/translate?url=https%3A%2F%2Fgithub.c ### Project Status +**Сейчас MDBX _активно перерабатывается_** предстоит +большое изменение как API, так и формата базы данных. К сожалению, +обновление приведет к потере совместимости с предыдущими версиями. 
-**Сейчас MDBX _активно перерабатывается_** и к середине 2018 -ожидается большое изменение как API, так и формата базы данных. -К сожалению, обновление приведет к потере совместимости с -предыдущими версиями. +Цель этой революции - обеспечение более четкого надежного API и +добавление новых функции, а также наделение базы данных новыми +свойствами. -Цель этой революции - обеспечение более четкого надежного -API и добавление новых функции, а также наделение базы данных -новыми свойствами. - -В настоящее время MDBX предназначена для Linux, а также -поддерживает Windows (начиная с Windows Server 2008) в качестве -дополнительной платформы. Поддержка других ОС может быть -обеспечена на коммерческой основе. Однако такие -усовершенствования (т. е. pull-requests) могут быть приняты в -мейнстрим только в том случае, если будет доступен -соответствующий публичный и бесплатный сервис непрерывной -интеграции (aka Continuous Integration). +В настоящее время MDBX предназначена для Linux, а также поддерживает +Windows (начиная с Windows Server 2008) в качестве дополнительной +платформы. Поддержка других ОС может быть обеспечена на коммерческой +основе. Однако такие усовершенствования (т. е. pull-requests) могут быть +приняты в мейнстрим только в том случае, если будет доступен +соответствующий публичный и бесплатный сервис непрерывной интеграции +(aka Continuous Integration). ## Содержание diff --git a/mdbx.h b/mdbx.h index cedcbc11..8d5265d4 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1,6 +1,6 @@ /* LICENSE AND COPYRUSTING ***************************************************** * - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -60,8 +60,8 @@ /* IMPENDING CHANGES WARNING *************************************************** * - * MDBX is under active development, database format and API aren't stable - * at least until 2018Q2. New version won't be backwards compatible. 
Main focus + * MDBX is under active non-public development, database format and API + * will be refined. New version won't be backwards compatible. Main focus * of the rework is to provide clear and robust API and new features. * ******************************************************************************/ diff --git a/src/bits.h b/src/bits.h index 76419db5..b7094ef2 100644 --- a/src/bits.h +++ b/src/bits.h @@ -1,5 +1,5 @@ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/defs.h b/src/defs.h index 6da5a963..992d8f74 100644 --- a/src/defs.h +++ b/src/defs.h @@ -1,5 +1,5 @@ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/lck-posix.c b/src/lck-posix.c index e2353575..55523efc 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/lck-windows.c b/src/lck-windows.c index 21d6f05a..9f56568f 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/mdbx.c b/src/mdbx.c index b2124fe9..1ff535b7 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1,5 +1,5 @@ -/* - * Copyright 2015-2018 Leonid Yuriev +/* + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/src/osal.c b/src/osal.c index 502205ae..2350ce7a 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1,7 +1,7 @@ /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/osal.h b/src/osal.h index 5754a83a..69857cd1 100644 --- a/src/osal.h +++ b/src/osal.h @@ -1,7 +1,7 @@ /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 9ad65f8d..38e9e204 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -1,7 +1,7 @@ -/* mdbx_chk.c - memory-mapped database check tool */ +/* mdbx_chk.c - memory-mapped database check tool */ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/tools/mdbx_copy.1 b/src/tools/mdbx_copy.1 index db6c453a..74d94b6b 100644 --- a/src/tools/mdbx_copy.1 +++ b/src/tools/mdbx_copy.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2018 Leonid Yuriev . +.\" Copyright 2015-2019 Leonid Yuriev . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. diff --git a/src/tools/mdbx_copy.c b/src/tools/mdbx_copy.c index ee3f739d..9b0c833a 100644 --- a/src/tools/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -1,7 +1,7 @@ /* mdbx_copy.c - memory-mapped database backup tool */ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/src/tools/mdbx_dump.1 b/src/tools/mdbx_dump.1 index ccfcc0c9..93d29a7c 100644 --- a/src/tools/mdbx_dump.1 +++ b/src/tools/mdbx_dump.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2018 Leonid Yuriev . +.\" Copyright 2015-2019 Leonid Yuriev . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index c854e0ad..246aca8f 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -1,7 +1,7 @@ /* mdbx_dump.c - memory-mapped database dump tool */ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/tools/mdbx_load.1 b/src/tools/mdbx_load.1 index 7a18a6c0..5227fd9b 100644 --- a/src/tools/mdbx_load.1 +++ b/src/tools/mdbx_load.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2018 Leonid Yuriev . +.\" Copyright 2015-2019 Leonid Yuriev . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 4a337a1a..c36084c0 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -1,7 +1,7 @@ /* mdbx_load.c - memory-mapped database load tool */ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/tools/mdbx_stat.1 b/src/tools/mdbx_stat.1 index ca427f7a..50a30b4f 100644 --- a/src/tools/mdbx_stat.1 +++ b/src/tools/mdbx_stat.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2018 Leonid Yuriev . +.\" Copyright 2015-2019 Leonid Yuriev . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. 
See COPYRIGHT/LICENSE. diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index a219b9ec..0791d9f3 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -1,7 +1,7 @@ /* mdbx_stat.c - memory-mapped database status tool */ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/version.c b/src/version.c index d5cd3697..8077d69c 100644 --- a/src/version.c +++ b/src/version.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2018 Leonid Yuriev + * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ca7dd794..d34da91a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET mdbx_test) +set(TARGET mdbx_test) project(${TARGET}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations") diff --git a/test/base.h b/test/base.h index bc82ff26..8210c18b 100644 --- a/test/base.h +++ b/test/base.h @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/cases.cc b/test/cases.cc index 13d47576..58141603 100644 --- a/test/cases.cc +++ b/test/cases.cc @@ -1,5 +1,5 @@ -/* - * Copyright 2017-2018 Leonid Yuriev +/* + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/chrono.cc b/test/chrono.cc index f7346686..38cb321a 100644 --- a/test/chrono.cc +++ b/test/chrono.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/test/chrono.h b/test/chrono.h index c2bd5627..11675195 100644 --- a/test/chrono.h +++ b/test/chrono.h @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/config.cc b/test/config.cc index 7fa46208..bb5fc61c 100644 --- a/test/config.cc +++ b/test/config.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/config.h b/test/config.h index 1886a8ea..dd14046f 100644 --- a/test/config.h +++ b/test/config.h @@ -1,5 +1,5 @@ -/* - * Copyright 2017-2018 Leonid Yuriev +/* + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/copy.cc b/test/copy.cc index e239d41e..ff53153e 100644 --- a/test/copy.cc +++ b/test/copy.cc @@ -1,4 +1,4 @@ -#include "test.h" +#include "test.h" void testcase_copy::copy_db(const bool with_compaction) { int err = osal_removefile(copy_pathname); diff --git a/test/dead.cc b/test/dead.cc index ee13fbd0..7f55ba8c 100644 --- a/test/dead.cc +++ b/test/dead.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/hill.cc b/test/hill.cc index 753a095b..2abe6d0d 100644 --- a/test/hill.cc +++ b/test/hill.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/jitter.cc b/test/jitter.cc index 25514004..e8876921 100644 --- a/test/jitter.cc +++ b/test/jitter.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * diff --git a/test/keygen.cc b/test/keygen.cc index c7a70606..4753899e 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/keygen.h b/test/keygen.h index bbd97b29..1466720d 100644 --- a/test/keygen.h +++ b/test/keygen.h @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/log.cc b/test/log.cc index 0e325e3a..79544e11 100644 --- a/test/log.cc +++ b/test/log.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/log.h b/test/log.h index e09cccaa..03e2bbc9 100644 --- a/test/log.h +++ b/test/log.h @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/main.cc b/test/main.cc index 275b7b13..7ebe2568 100644 --- a/test/main.cc +++ b/test/main.cc @@ -1,5 +1,5 @@ -/* - * Copyright 2017-2018 Leonid Yuriev +/* + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/osal-unix.cc b/test/osal-unix.cc index 6e6d7a1c..ee2051a8 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -1,5 +1,5 @@ -/* - * Copyright 2017-2018 Leonid Yuriev +/* + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* diff --git a/test/osal-windows.cc b/test/osal-windows.cc index f7f1de56..ea802182 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -1,5 +1,5 @@ -/* - * Copyright 2017-2018 Leonid Yuriev +/* + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/osal.h b/test/osal.h index 3ccc7bbe..eb88008b 100644 --- a/test/osal.h +++ b/test/osal.h @@ -1,5 +1,5 @@ -/* - * Copyright 2017-2018 Leonid Yuriev +/* + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/test.cc b/test/test.cc index 445c4c8d..de02ae4f 100644 --- a/test/test.cc +++ b/test/test.cc @@ -1,5 +1,5 @@ -/* - * Copyright 2017-2018 Leonid Yuriev +/* + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/test.h b/test/test.h index 765940ce..e2e0d7c4 100644 --- a/test/test.h +++ b/test/test.h @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/utils.cc b/test/utils.cc index 53a750e3..2df6dc72 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/test/utils.h b/test/utils.h index 5d62909f..e0479cb6 100644 --- a/test/utils.h +++ b/test/utils.h @@ -1,5 +1,5 @@ /* - * Copyright 2017-2018 Leonid Yuriev + * Copyright 2017-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/tutorial/sample-bdb.txt b/tutorial/sample-bdb.txt index 1015d064..440efddb 100644 --- a/tutorial/sample-bdb.txt +++ b/tutorial/sample-bdb.txt @@ -4,7 +4,7 @@ */ /* - * Copyright 2015-2018 Leonid Yuriev . 
+ * Copyright 2015-2019 Leonid Yuriev . * Copyright 2012-2015 Howard Chu, Symas Corp. * Copyright 2015,2016 Peter-Service R&D LLC. * All rights reserved. diff --git a/tutorial/sample-mdbx.c b/tutorial/sample-mdbx.c index aaafbc31..991ab698 100644 --- a/tutorial/sample-mdbx.c +++ b/tutorial/sample-mdbx.c @@ -5,7 +5,7 @@ /* * Copyright 2017 Ilya Shipitsin . - * Copyright 2015-2018 Leonid Yuriev . + * Copyright 2015-2019 Leonid Yuriev . * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. * From ba00b597a7b90336676533c9d87164a207fa5aff Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 17 Jan 2019 23:09:40 +0300 Subject: [PATCH 64/83] mdbx-windows: backport - fix returning negative value on failure. Change-Id: Iaf5fb1f0cbcc3c14e2d3edf1f57538ecc0dfdf00 --- src/mdbx.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 1ff535b7..c5148a80 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5024,7 +5024,7 @@ fail: int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { if (!env || env->me_signature != MDBX_ME_SIGNATURE || !env->me_maxkey_limit) - return -MDBX_EINVAL; + return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; return env->me_maxkey_limit; } @@ -11538,7 +11538,7 @@ int __cold mdbx_reader_list(MDBX_env *env, MDBX_msg_func *func, void *ctx) { int rc = 0, first = 1; if (unlikely(!env || !func)) - return -MDBX_EINVAL; + return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -11848,7 +11848,7 @@ __attribute__((no_sanitize_thread, noinline)) int mdbx_txn_straggler(MDBX_txn *txn, int *percent) { if (unlikely(!txn)) - return -MDBX_EINVAL; + return (MDBX_EINVAL > 0) ? 
-MDBX_EINVAL : MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; @@ -12501,6 +12501,17 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /*----------------------------------------------------------------------------*/ +__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_syspagesize(); + else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !mdbx_is_power2((size_t)pagesize))) + return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; + + return mdbx_maxkey(mdbx_nodemax(pagesize)); +} + __cold int mdbx_limits_pgsize_min(void) { return MIN_PAGESIZE; } __cold int mdbx_limits_pgsize_max(void) { return MAX_PAGESIZE; } @@ -12511,7 +12522,7 @@ __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || !mdbx_is_power2((size_t)pagesize))) - return -MDBX_EINVAL; + return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; return MIN_PAGENO * pagesize; } @@ -12522,7 +12533,7 @@ __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || !mdbx_is_power2((size_t)pagesize))) - return -MDBX_EINVAL; + return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; const uint64_t limit = MAX_PAGENO * (uint64_t)pagesize; return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit From 64f6648d0cfb72fc8b0c1e3da68131ad02561d43 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 3 Feb 2019 12:47:56 +0300 Subject: [PATCH 65/83] mdbx: make API compatible to the master branch. 
Change-Id: I95c5db639cebe4bba9c600f97c9966082bc9bc09 --- mdbx.h | 2 ++ src/mdbx.c | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/mdbx.h b/mdbx.h index 8d5265d4..9285b236 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1683,6 +1683,8 @@ LIBMDBX_API int mdbx_limits_pgsize_min(void); LIBMDBX_API int mdbx_limits_pgsize_max(void); LIBMDBX_API intptr_t mdbx_limits_dbsize_min(intptr_t pagesize); LIBMDBX_API intptr_t mdbx_limits_dbsize_max(intptr_t pagesize); +LIBMDBX_API intptr_t mdbx_limits_keysize_max(intptr_t pagesize); +LIBMDBX_API intptr_t mdbx_limits_txnsize_max(intptr_t pagesize); /*----------------------------------------------------------------------------*/ /* attribute support functions for Nexenta */ diff --git a/src/mdbx.c b/src/mdbx.c index c5148a80..80e6435e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5037,14 +5037,7 @@ int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) int mdbx_get_maxkeysize(intptr_t pagesize) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_syspagesize(); - else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !mdbx_is_power2((size_t)pagesize))) - return -MDBX_EINVAL; - - return mdbx_maxkey(mdbx_nodemax(pagesize)); + return (int)mdbx_limits_keysize_max(pagesize); } static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { @@ -12540,6 +12533,17 @@ __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { : (intptr_t)MAX_MAPSIZE; } +__cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_syspagesize(); + else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !mdbx_is_power2((size_t)pagesize))) + return (MDBX_EINVAL > 0) ? 
-MDBX_EINVAL : MDBX_EINVAL; + + return pagesize * (MDBX_PNL_UM_SIZE - 1); +} + /*----------------------------------------------------------------------------*/ /* attribute support functions for Nexenta */ From 131485e5160b492a7486dfeca0169ff450f56191 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 10 Feb 2019 16:23:49 +0300 Subject: [PATCH 66/83] mdbx: fix comment typo (minor). --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 80e6435e..0b2bfd82 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4655,7 +4655,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper]. * - * Copy-with-compaction by previous version of libmfbx could produce DB-file + * Copy-with-compaction by previous version of libmdbx could produce DB-file * less than meta.geo.lower bound, in case actual filling is low or no data * at all. This is not a problem as there is no damage or loss of data. * Therefore it is better not to consider such situation as an error, but From b1ffe87556f0e7b8ea0cefc283f64896a30dc5a9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 13 Feb 2019 20:23:24 +0300 Subject: [PATCH 67/83] mdbx: fix one more comment typo (minor). --- mdbx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdbx.h b/mdbx.h index 9285b236..f698b340 100644 --- a/mdbx.h +++ b/mdbx.h @@ -591,7 +591,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * - MDBX_NOTLS * Don't use Thread-Local Storage. Tie reader locktable slots to * MDBX_txn objects instead of to threads. I.e. mdbx_txn_reset() keeps - * the slot reseved for the MDBX_txn object. A thread may use parallel + * the slot reserved for the MDBX_txn object. A thread may use parallel * read-only transactions. A read-only transaction may span threads if * the user synchronizes its use. Applications that multiplex many * user threads over individual OS threads need this option. 
Such an From 46eb178f07c655fd9c29123a4ef95bca7c517bb8 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Jun 2019 01:29:45 +0300 Subject: [PATCH 68/83] mdbx: backport - fix GC corruption due to deep recursive rebalance from update_gc(). Change-Id: I810250deb25cd625e737000282b434e3158ef8cc --- src/mdbx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index 0b2bfd82..0f1d89c5 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3885,7 +3885,9 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); + mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); + mc.mc_flags -= C_RECLAIMING; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; From 06c35dd59c3b4229a526c99825416dc58c8382ae Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 2 Feb 2019 17:13:41 +0300 Subject: [PATCH 69/83] mdbx: backport - fix __ANDROID__ typo. Thanks to Howard Chu . Change-Id: Ibcbe2e4790a5df5758d9fd6c621793ea42a94682 --- src/lck-posix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 55523efc..67e6aefc 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -20,8 +20,8 @@ #ifndef MDBX_USE_ROBUST /* Howard Chu: Android currently lacks Robust Mutex support */ #if defined(EOWNERDEAD) && \ - !defined(ANDROID) /* LY: glibc before 2.10 has a troubles with Robust \ - Mutex too. */ \ + !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \ + with Robust Mutex too. */ \ && __GLIBC_PREREQ(2, 10) #define MDBX_USE_ROBUST 1 #else From fead1c3853d14be9ed2fe711c9b1ba0d277e3294 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 4 Feb 2019 00:08:56 +0300 Subject: [PATCH 70/83] mdbx: backport - fix handling MDBX_APPENDDUP mode.
Change-Id: I36de2a8dcab5126dab3857a7840ab3904a1d19c8 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 0f1d89c5..cd81e361 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7699,7 +7699,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (rc > 0) { rc = MDBX_NOTFOUND; mc->mc_ki[mc->mc_top]++; - } else { + } else if (unlikely(rc < 0 || (flags & MDBX_APPENDDUP) == 0)) { /* new key is <= last key */ rc = MDBX_EKEYMISMATCH; } From 6c160d02af1c6ee1e1ad4da3ed43aa4d8ac55694 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 2 May 2019 16:46:05 +0300 Subject: [PATCH 71/83] mdbx: backport - fix TAGRET typo (minor). Change-Id: Iffafbed7fdad3492aeb51f17caf8109a5b3e35c0 --- src/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.c b/src/version.c index 8077d69c..05f83ba2 100644 --- a/src/version.c +++ b/src/version.c @@ -30,5 +30,5 @@ "@MDBX_GIT_DESCRIBE@"}}; /*LIBMDBX_EXPORTS*/ const mdbx_build_info mdbx_build = { - "@MDBX_BUILD_TIMESTAMP@", "@MDBX_BUILD_TAGRET@", "@MDBX_BUILD_OPTIONS@", + "@MDBX_BUILD_TIMESTAMP@", "@MDBX_BUILD_TARGET@", "@MDBX_BUILD_OPTIONS@", "@MDBX_BUILD_COMPILER@", "@MDBX_BUILD_FLAGS@"}; From 828889de5c446a417ecbb2899f0d12bbb2d0d984 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 30 Jan 2019 23:43:34 +0000 Subject: [PATCH 72/83] mdbx: import - tweak mdb_page_split (ITS#8969). Bump up number of keys for which we use fine-grained splitpoint search Change-Id: Icca2e1953cbcd6898b790f657636c2195b397790 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index cd81e361..a81f332a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10008,7 +10008,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, * This yields better packing during sequential inserts. 
*/ int dir; - if (nkeys < 20 || nsize > pmax / 16 || newindx >= nkeys) { + if (nkeys < 32 || nsize > pmax / 16 || newindx >= nkeys) { /* Find split point */ psize = 0; if (newindx <= split_indx || newindx >= nkeys) { From 26838a2164658b8ff74ca454d3183065fc280a3f Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Jun 2019 18:46:41 +0300 Subject: [PATCH 73/83] mdbx: rework RECLAIMING inside update_gc(). Change-Id: I9cf592476780bfdb346472baa12497d68a3d5aad --- src/mdbx.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index a81f332a..2b885517 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1,4 +1,4 @@ -/* +/* * Copyright 2015-2019 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -3471,6 +3471,7 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { const int extra = mdbx_backlog_extragap(txn->mt_env); if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { + mc->mc_flags &= ~C_RECLAIMING; int rc = mdbx_cursor_touch(mc); if (unlikely(rc)) return rc; @@ -3484,6 +3485,7 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { break; } } + mc->mc_flags |= C_RECLAIMING; } return MDBX_SUCCESS; @@ -3504,6 +3506,7 @@ static int mdbx_update_gc(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) return rc; + mc.mc_flags |= C_RECLAIMING; mc.mc_next = txn->mt_cursors[FREE_DBI]; txn->mt_cursors[FREE_DBI] = &mc; @@ -3559,9 +3562,7 @@ retry: mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, cleaned_gc_slot, cleaned_gc_id); - mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_del(&mc, 0); - mc.mc_flags ^= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]); @@ -3581,9 +3582,7 @@ retry: mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, cleaned_gc_id); - 
mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_del(&mc, 0); - mc.mc_flags ^= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; settled = 0; @@ -3681,7 +3680,9 @@ retry: if (befree_stored < txn->mt_befree_pages[0]) { if (unlikely(!befree_stored)) { /* Make sure last page of freeDB is touched and on befree-list */ + mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); + mc.mc_flags |= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND)) goto bailout; } @@ -3766,7 +3767,9 @@ retry: left > ((unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slot) * env->me_maxgc_ov1page) { /* LY: need just a txn-id for save page list. */ + mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); + mc.mc_flags |= C_RECLAIMING; if (likely(rc == MDBX_SUCCESS)) { /* LY: ok, reclaimed from freedb. */ mdbx_trace("%s: took @%" PRIaTXN " from GC, continue", @@ -3885,9 +3888,7 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); - mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - mc.mc_flags -= C_RECLAIMING; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -3973,9 +3974,7 @@ retry: key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT | MDBX_RESERVE); - mc.mc_flags ^= C_RECLAIMING; mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; From ebc8e9935e74453d556acc807e7dc0faba88d21f Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 22 Jun 2019 23:31:10 +0300 Subject: [PATCH 74/83] mdbx: bump version to v0.1.7 Change-Id: I0f72ed31fbd1ed74a875c2aa2023521855e72894 --- src/version.c | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.c b/src/version.c index 05f83ba2..e10d5055 100644 --- a/src/version.c +++ b/src/version.c @@ -18,8 +18,8 @@ #error "API version mismatch!" #endif -#define MDBX_VERSION_RELEASE 6 -#define MDBX_VERSION_REVISION 1 +#define MDBX_VERSION_RELEASE 7 +#define MDBX_VERSION_REVISION 0 /*LIBMDBX_EXPORTS*/ const mdbx_version_info mdbx_version = { MDBX_VERSION_MAJOR, From 5413407f23543d2badd5541cacfb2c5402fdffe6 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 23 Jun 2019 20:41:50 +0300 Subject: [PATCH 75/83] mdbx-test: backport - fix dbsize-options handling. Change-Id: Ia51f802ac1ad4e8b1b059a3f3b38214bda6b43fc --- test/main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/main.cc b/test/main.cc index 7ebe2568..c4b24f60 100644 --- a/test/main.cc +++ b/test/main.cc @@ -182,11 +182,11 @@ int main(int argc, char *const argv[]) { mdbx_limits_dbsize_min(params.pagesize), mdbx_limits_dbsize_max(params.pagesize))) continue; - if (config::parse_option(argc, argv, narg, "size", params.size_now, + if (config::parse_option(argc, argv, narg, "size-upper", params.size_upper, mdbx_limits_dbsize_min(params.pagesize), mdbx_limits_dbsize_max(params.pagesize))) continue; - if (config::parse_option(argc, argv, narg, "size-upper", params.size_upper, + if (config::parse_option(argc, argv, narg, "size", params.size_now, mdbx_limits_dbsize_min(params.pagesize), mdbx_limits_dbsize_max(params.pagesize))) continue; From 5c488d70337675f6e7e61f2beeacfff5424a4072 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 24 Jun 2019 00:56:26 +0300 Subject: [PATCH 76/83] mdbx: backport - fix pwrite() for WRITE_MAX. 
Change-Id: If4924d20c1e267c2d3a190c860b89fc2fda0d517 --- src/osal.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/osal.c b/src/osal.c index 2350ce7a..d398a87e 100644 --- a/src/osal.c +++ b/src/osal.c @@ -533,17 +533,25 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, return (bytes == written) ? MDBX_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; return GetLastError(); #else - int rc; - intptr_t written; - do { + while (true) { STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); - written = pwrite(fd, buf, bytes, offset); + const intptr_t written = + pwrite(fd, buf, (bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset); if (likely(bytes == (size_t)written)) return MDBX_SUCCESS; - rc = errno; - } while (rc == EINTR); - return (written < 0) ? rc : MDBX_EIO /* Use which error code (ENOSPC)? */; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + } else if (written > 0) { + bytes -= written; + offset += written; + buf = (char *)buf + written; + } else { + return -1; + } + } #endif } From c7674f671d1f85b2c070c351789c73e1120c6867 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 8 Jul 2019 15:27:26 +0300 Subject: [PATCH 77/83] mdbx: backport - refine backlog preparation inside update_gc(). Change-Id: Ib18842c2922afba794d6ab69337580bcea29bfe6 --- src/mdbx.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 2b885517..104d9ea7 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3455,7 +3455,7 @@ static __inline int mdbx_backlog_size(MDBX_txn *txn) { int reclaimed = txn->mt_env->me_reclaimed_pglist ? 
txn->mt_env->me_reclaimed_pglist[0] : 0; - return reclaimed + txn->mt_loose_count + txn->mt_end_pgno - txn->mt_next_pgno; + return reclaimed + txn->mt_loose_count; } static __inline int mdbx_backlog_extragap(MDBX_env *env) { @@ -3468,7 +3468,9 @@ static __inline int mdbx_backlog_extragap(MDBX_env *env) { * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. */ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { /* LY: extra page(s) for b-tree rebalancing */ - const int extra = mdbx_backlog_extragap(txn->mt_env); + const int extra = + mdbx_backlog_extragap(txn->mt_env) + + MDBX_PNL_SIZEOF(txn->mt_befree_pages) / txn->mt_env->me_maxkey_limit; if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { mc->mc_flags &= ~C_RECLAIMING; @@ -3476,11 +3478,10 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { if (unlikely(rc)) return rc; - int backlog; - while (unlikely((backlog = mdbx_backlog_size(txn)) < extra)) { + while (unlikely(mdbx_backlog_size(txn) < extra)) { rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); if (unlikely(rc)) { - if (unlikely(rc != MDBX_NOTFOUND)) + if (rc != MDBX_NOTFOUND) return rc; break; } @@ -3491,6 +3492,20 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { return MDBX_SUCCESS; } +static void mdbx_prep_backlog_data(MDBX_txn *txn, MDBX_cursor *mc, + size_t bytes) { + const int wanna = + (int)OVPAGES(txn->mt_env, bytes) + mdbx_backlog_extragap(txn->mt_env); + if (unlikely(wanna > mdbx_backlog_size(txn))) { + mc->mc_flags &= ~C_RECLAIMING; + do { + if (mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC) != MDBX_SUCCESS) + break; + } while (wanna > mdbx_backlog_size(txn)); + mc->mc_flags |= C_RECLAIMING; + } +} + /* Cleanup reclaimed GC records, than save the befree-list as of this * transaction to GC (aka freeDB). This recursive changes the reclaimed-list * loose-list and befree-list. Keep trying until it stabilizes. 
*/ @@ -3683,7 +3698,7 @@ retry: mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); mc.mc_flags |= C_RECLAIMING; - if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND)) + if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } /* Write to last page of freeDB */ @@ -3691,6 +3706,7 @@ retry: key.iov_base = &txn->mt_txnid; do { data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages); + mdbx_prep_backlog_data(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -3888,6 +3904,7 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); + mdbx_prep_backlog_data(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); if (unlikely(rc != MDBX_SUCCESS)) From ce75405ccc62947938f670195cbc534a31b4348f Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 9 Jul 2019 15:23:19 +0300 Subject: [PATCH 78/83] mdbx: backport - fix DB-shrinking race with copy-asis & readers. Change-Id: I893b388d186b6425ab60be4b7cc6bf9b67142def --- src/bits.h | 9 +++++--- src/mdbx.c | 68 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 17 deletions(-) diff --git a/src/bits.h b/src/bits.h index b7094ef2..d23d72d5 100644 --- a/src/bits.h +++ b/src/bits.h @@ -244,11 +244,14 @@ typedef struct MDBX_reader { volatile mdbx_pid_t mr_pid; /* The thread ID of the thread owning this txn. */ volatile mdbx_tid_t mr_tid; + /* The number of pages used in the reader's MVCC snapshot, + * i.e. 
the value of meta->mm_geo.next and txn->mt_next_pgno */ + volatile pgno_t mr_snapshot_pages; /* cache line alignment */ - uint8_t pad[MDBX_CACHELINE_SIZE - - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) % - MDBX_CACHELINE_SIZE]; + uint8_t pad[MDBX_CACHELINE_SIZE - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + + sizeof(mdbx_tid_t) + sizeof(pgno_t)) % + MDBX_CACHELINE_SIZE]; } MDBX_reader; /* Information about a single database in the environment. */ diff --git a/src/mdbx.c b/src/mdbx.c index 104d9ea7..b1b7c391 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1934,6 +1934,32 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { return oldest; } +/* Find largest mvcc-snapshot still referenced. */ +static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { + MDBX_lockinfo *const lck = env->me_lck; + if (likely(lck != NULL /* exclusive mode */)) { + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (lck->mti_readers[i].mr_pid) { + /* mdbx_jitter4testing(true); */ + const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages; + const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid; + mdbx_memory_barrier(); + if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages || + snap_txnid != lck->mti_readers[i].mr_txnid)) + goto retry; + if (largest < snap_pages && + lck->mti_oldest <= /* ignore pending updates */ snap_txnid && + snap_txnid <= env->me_txn0->mt_txnid) + largest = snap_pages; + } + } + } + + return largest; +} + /* Add a page to the txn's dirty list */ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { MDBX_ID2 mid; @@ -2962,6 +2988,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false); if (r) { + r->mr_snapshot_pages = meta->mm_geo.next; r->mr_txnid = snap; mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid == mdbx_getpid()); @@ -2998,6 +3025,8 @@ static int 
mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); txn->mt_ro_reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); } else { /* Not yet touching txn == env->me_txn0, it may be active */ mdbx_jitter4testing(false); @@ -3338,9 +3367,16 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (txn->mt_ro_reader) { + mdbx_ensure(env, /* paranoia is appropriate here */ + txn->mt_txnid == txn->mt_ro_reader->mr_txnid && + txn->mt_ro_reader->mr_txnid >= env->me_lck->mti_oldest); + txn->mt_ro_reader->mr_snapshot_pages = 0; txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; + mdbx_memory_barrier(); env->me_lck->mti_readers_refresh_flag = true; if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) @@ -4836,19 +4872,23 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next > pending->mm_geo.shrink + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; - const pgno_t with_backlog_gap = pending->mm_geo.next + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) - mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + const pgno_t largest = mdbx_find_largest(env, pending->mm_geo.next); + if (pending->mm_geo.now > largest && + pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) { + const pgno_t aligner = + pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; + const pgno_t with_backlog_gap = largest + backlog_gap; + const pgno_t aligned = pgno_align2os_pgno( + env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t bottom = + (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) + mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + } } } @@ -5467,7 +5507,7 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { +static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { uint64_t filesize_before_mmap; MDBX_meta meta; int rc = MDBX_RESULT_FALSE; From 334aa68a85a55c38fc97eef729b2eff93e014495 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 27 Jul 2019 22:13:28 +0300 Subject: [PATCH 79/83] Note about moving from Github. The [repository was moved](https://abf.io/erthink/libmdbx) due to illegal discriminatory blocking of access from the territory of the Russian Crimea and for sovereign crimeans. Crimea is Russia. 
Change-Id: I5a4eb6b50be2e88f4dc6658d00331954e373603a --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 48209756..6119d788 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +## The [repository was moved](https://abf.io/erthink/libmdbx) due to illegal discriminatory blocking of access from the territory of the Russian Crimea and for sovereign crimeans. + +--- + libmdbx ====================================== **Revised and extended descendant of [Symas LMDB](https://symas.com/lmdb/).** From 45487eb052f98819e0ff816dbadcc502d9c09c7f Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 22 Aug 2019 21:11:49 +0300 Subject: [PATCH 80/83] mdbx: backport - fix env_sync_ex() for out-of-txn case. Change-Id: Ie19bbe1d467ce4fc83f8dfc1e367070f532ee335 --- src/mdbx.c | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index b1b7c391..06594421 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1898,7 +1898,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { MDBX_lockinfo *const lck = env->me_lck; const txnid_t edge = mdbx_reclaiming_detent(env); - mdbx_tassert(txn, edge <= txn->mt_txnid - 1); + mdbx_tassert(txn, edge <= txn->mt_txnid); const txnid_t last_oldest = lck->mti_oldest; mdbx_tassert(txn, edge >= last_oldest); if (last_oldest == edge) @@ -1935,7 +1935,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { } /* Find largest mvcc-snapshot still referenced. 
*/ -static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { +static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck; if (likely(lck != NULL /* exclusive mode */)) { const unsigned snap_nreaders = lck->mti_numreaders; @@ -2739,39 +2739,43 @@ static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { return rc; } - MDBX_meta *head = mdbx_meta_head(env); + const MDBX_meta *head = mdbx_meta_head(env); if (!META_IS_STEADY(head) || env->me_sync_pending) { if (force || (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - if (outside_txn && - env->me_sync_pending > - pgno2bytes(env, 16 /* FIXME: define threshold */) && - (flags & MDBX_NOSYNC) == 0) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); + if (outside_txn) { + if (env->me_sync_pending > + pgno2bytes(env, 16 /* FIXME: define threshold */) && + (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0) { + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); - mdbx_txn_unlock(env); + mdbx_txn_unlock(env); - /* LY: pre-sync without holding lock to reduce latency for writer(s) */ - int rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, - flags & MDBX_MAPASYNC) - : mdbx_filesync(env->me_fd, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* LY: pre-sync without holding lock to reduce latency for writer(s) */ + int rc = (flags & MDBX_WRITEMAP) + ? 
mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) + : mdbx_filesync(env->me_fd, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - rc = mdbx_txn_lock(env, nonblock); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + rc = mdbx_txn_lock(env, nonblock); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* LY: head may be changed. */ - head = mdbx_meta_head(env); + /* LY: head may be changed. */ + head = mdbx_meta_head(env); + } + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); } - if (!META_IS_STEADY(head) || env->me_sync_pending) { + if (!META_IS_STEADY(head) || + ((flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0 && + env->me_sync_pending)) { mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIuPTR, container_of(head, MDBX_page, mp_data)->mp_pgno, mdbx_durable_str(head), env->me_sync_pending); From 78e354689e1a7c5cb3d84a543b8a225f718ec609 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 22 Aug 2019 22:33:18 +0300 Subject: [PATCH 81/83] mdbx: backport - don't shrink DB less largest reader inside mdbx_env_set_geometry(). 
Change-Id: I42a5d3a08313fb9590a6730bc0dc06c7b4f16634 --- src/mdbx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 06594421..405a80f6 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5270,6 +5270,10 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, need_unlock = true; } MDBX_meta *head = mdbx_meta_head(env); + if (!inside_txn) { + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); + } if (pagesize < 0) pagesize = env->me_psize; @@ -5289,7 +5293,8 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, if (shrink_threshold < 0) shrink_threshold = pgno2bytes(env, head->mm_geo.shrink); - const size_t usedbytes = pgno2bytes(env, head->mm_geo.next); + const size_t usedbytes = + pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next)); if ((size_t)size_upper < usedbytes) { rc = MDBX_MAP_FULL; goto bailout; From b35789774575cc0f5dfef29f5a457ee51ce15f39 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Fri, 23 Aug 2019 17:47:38 +0300 Subject: [PATCH 82/83] mdbx: backport - fix reclaiming_detent() for out-of-txn case. Change-Id: I5275f4f3676b125e860f6a7c204a5f9cdc65dd5f --- src/mdbx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 405a80f6..6b9601f2 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1875,7 +1875,9 @@ static __hot MDBX_meta *mdbx_meta_head(const MDBX_env *env) { static __hot txnid_t mdbx_reclaiming_detent(const MDBX_env *env) { if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC)) - return env->me_txn->mt_txnid - 1; + return likely(env->me_txn0->mt_owner == mdbx_thread_self()) + ? 
? env->me_txn0->mt_txnid - 1 + : mdbx_meta_txnid_fluid(env, mdbx_meta_head(env)); return mdbx_meta_txnid_stable(env, mdbx_meta_steady(env)); } From 09cc8c51e290b37f5865199a88dcd0b1932d201d Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 27 Aug 2019 14:50:19 +0300 Subject: [PATCH 83/83] mdbx-load: backport - fix backslash escaping (for compatibility with ITS#9068). In fact, MDBX is not affected by this bug, since mdbx_dump was fixed a long time ago to not produce the problematic sequence of backslashes. This change is for compatibility with LMDB after http://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=commit;h=5c012bbe033f9bbb273078b07dded59f080d348d Change-Id: I8ff8e003ae29504605402b937becd4fb37120408 --- src/tools/mdbx_load.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index c36084c0..ac2ed171 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -262,10 +262,9 @@ static int readline(MDBX_val *out, MDBX_val *buf) { if (mode & PRINT) { while (c2 < end) { - if (*c2 == '\\') { + if (unlikely(*c2 == '\\')) { if (c2[1] == '\\') { - c1++; - c2 += 2; + *c1++ = '\\'; } else { if (c2 + 3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) { Eof = 1; @@ -273,8 +272,8 @@ static int readline(MDBX_val *out, MDBX_val *buf) { return EOF; } *c1++ = (char)unhex(++c2); - c2 += 2; } + c2 += 2; } else { /* copies are redundant when no escapes were used */ *c1++ = *c2++;