From 5e43ee61a262f2fbc32b3f1b729a5730640f22d5 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 1 Aug 2020 19:13:17 +0300 Subject: [PATCH] mdbx: drop/deprecate MDBX_MAPASYNC. Change-Id: I472f97f568a32325eb056c8ee4d2f2350a473bda --- GNUmakefile | 6 +- README.md | 17 +++-- mdbx.h | 115 ++++++++++++-------------------- src/core.c | 149 +++++++++++++++++++----------------------- src/internals.h | 6 +- src/osal.c | 82 ++++++++++++----------- src/osal.h | 8 ++- test/config.cc | 1 - test/main.cc | 5 +- test/nested.cc | 3 +- test/pcrf/pcrf_test.c | 4 +- 11 files changed, 182 insertions(+), 214 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 9baec289..6b8222b3 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -155,7 +155,7 @@ check: test dist test: build-test rm -f $(TEST_DB) $(TEST_LOG) && (set -o pipefail; \ (./mdbx_test --table=+data.integer --keygen.split=29 --datalen.min=min --datalen.max=max --progress --console=no --repeat=$(TEST_ITER) --pathname=$(TEST_DB) --dont-cleanup-after basic && \ - ./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=12 --pathname=$(TEST_DB) --dont-cleanup-after basic) \ + ./mdbx_test --mode=-writemap,-nosync-safe,-lifo --progress --console=no --repeat=12 --pathname=$(TEST_DB) --dont-cleanup-after basic) \ | tee >(gzip --stdout > $(TEST_LOG)) | tail -n 42) \ && ./mdbx_chk -vvn $(TEST_DB) && ./mdbx_chk -vvn $(TEST_DB)-copy @@ -163,7 +163,7 @@ test-singleprocess: all mdbx_test rm -f $(TEST_DB) $(TEST_LOG) && (set -o pipefail; \ (./mdbx_test --table=+data.integer --keygen.split=29 --datalen.min=min --datalen.max=max --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --hill && \ ./mdbx_test --progress --console=no --repeat=2 --pathname=$(TEST_DB) --dont-cleanup-before --dont-cleanup-after --copy && \ - ./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --nested) \ + ./mdbx_test 
--mode=-writemap,-nosync-safe,-lifo --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --nested) \ | tee >(gzip --stdout > $(TEST_LOG)) | tail -n 42) \ && ./mdbx_chk -vvn $(TEST_DB) && ./mdbx_chk -vvn $(TEST_DB)-copy @@ -178,7 +178,7 @@ memcheck test-valgrind: rm -f valgrind-*.log $(TEST_DB) $(TEST_LOG) && (set -o pipefail; ( \ $(VALGRIND) ./mdbx_test --table=+data.integer --keygen.split=29 --datalen.min=min --datalen.max=max --progress --console=no --repeat=2 --pathname=$(TEST_DB) --dont-cleanup-after basic && \ $(VALGRIND) ./mdbx_test --progress --console=no --pathname=$(TEST_DB) --dont-cleanup-before --dont-cleanup-after --copy && \ - $(VALGRIND) ./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=4 --pathname=$(TEST_DB) --dont-cleanup-after basic && \ + $(VALGRIND) ./mdbx_test --mode=-writemap,-nosync-safe,-lifo --progress --console=no --repeat=4 --pathname=$(TEST_DB) --dont-cleanup-after basic && \ $(VALGRIND) ./mdbx_chk -vvn $(TEST_DB) && \ $(VALGRIND) ./mdbx_chk -vvn $(TEST_DB)-copy \ ) | tee >(gzip --stdout > $(TEST_LOG)) | tail -n 42) diff --git a/README.md b/README.md index 84e530c0..dcb6dbbe 100644 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ the user's point of view. 7. Fast estimation of range query result volume, i.e. how many items can be found between a `KEY1` and a `KEY2`. This is a prerequisite for build and/or optimize query execution plans. - > _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to corresponding keys. + > _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to corresponding keys. 8. `mdbx_chk` tool for database integrity check. @@ -259,9 +259,13 @@ pair, to the first, to the last, or not set to anything. ## Other fixes and specifics -1. 
Fixed more than 10 significant errors, in particular: page leaks, wrong sub-database statistics, segfault in several conditions, nonoptimal page merge strategy, updating an existing record with a change in data size (including for multimap), etc. +1. Fixed more than 10 significant errors, in particular: page leaks, +wrong sub-database statistics, segfault in several conditions, +nonoptimal page merge strategy, updating an existing record with +a change in data size (including for multimap), etc. -2. All cursors can be reused and should be closed explicitly, regardless ones were opened within a write or read transaction. +2. All cursors can be reused and should be closed explicitly, +regardless of whether they were opened within a write or read transaction. 3. Opening database handles are spared from race conditions and pre-opening is not needed. @@ -269,10 +273,9 @@ pre-opening is not needed. 4. Returning `MDBX_EMULTIVAL` error in case of ambiguous update or delete. 5. Guarantee of database integrity even in asynchronous unordered write-to-disk mode. - > _libmdbx_ propose additional trade-off by implementing append-like manner for updates - > in `MDBX_SAFE_NOSYNC` and `MDBX_WRITEMAP|MDBX_MAPASYNC` modes, that avoid database corruption after a system crash - > contrary to LMDB. Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode is available to match LMDB behaviour, - > and for special use-cases. + > _libmdbx_ proposes an additional trade-off: `MDBX_SAFE_NOSYNC` with an append-like manner of updates, + > that avoids database corruption after a system crash contrary to LMDB. + > Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode is available to match behaviour of the `MDB_NOSYNC` in LMDB. 6.
On **MacOS & iOS** the `fcntl(F_FULLFSYNC)` syscall is used _by default_ to synchronize data with the disk, as this is [the only way to diff --git a/mdbx.h b/mdbx.h index 1cbd7a76..09e86597 100644 --- a/mdbx.h +++ b/mdbx.h @@ -809,13 +809,12 @@ enum MDBX_env_flags_t { * series of write transactions, will be as small as possible. Thus creates * ideal conditions for the efficient operation of the disk write-back cache. * - * \ref MDBX_LIFORECLAIM is compatible with all no-sync flags (i.e. - * \ref MDBX_NOMETASYNC, \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref - * MDBX_MAPASYNC), but gives no noticeable impact in combination with \ref - * MDBX_SAFE_NOSYNC. Because MDBX will reused pages only before the last - * "steady" MVCC-snapshot, i.e. the loop length of database pages circulation - * will be mostly defined by frequency of calling `mdbx_env_sync()` rather - * than LIFO and FIFO difference. + * \ref MDBX_LIFORECLAIM is compatible with all no-sync flags, but gives NO + * noticeable impact in combination with \ref MDBX_SAFE_NOSYNC or + * \ref MDBX_UTTERLY_NOSYNC. Because MDBX will reused pages only before the + * last "steady" MVCC-snapshot, i.e. the loop length of database pages + * circulation will be mostly defined by frequency of calling + * `mdbx_env_sync()` rather than LIFO and FIFO difference. * * This flag may be changed at any time using mdbx_env_set_flags(). */ MDBX_LIFORECLAIM = UINT32_C(0x4000000), @@ -827,9 +826,9 @@ enum MDBX_env_flags_t { /** \defgroup sync_modes SYNC MODES * * \attention Using any combination of \ref MDBX_SAFE_NOSYNC, \ref - * MDBX_NOMETASYNC, \ref MDBX_MAPASYNC and especially \ref MDBX_UTTERLY_NOSYNC - * is always a deal to reduce durability for gain write performance. You must - * know exactly what you are doing and what risks you are taking! + * MDBX_NOMETASYNC and especially \ref MDBX_UTTERLY_NOSYNC is always a deal to + * reduce durability for gain write performance. 
You must know exactly what + * you are doing and what risks you are taking! * * \note for LMDB users: \ref MDBX_SAFE_NOSYNC is NOT similar to LMDB_NOSYNC, * but \ref MDBX_UTTERLY_NOSYNC is exactly match LMDB_NOSYNC. See details @@ -863,7 +862,6 @@ enum MDBX_env_flags_t { * * \see MDBX_NOMETASYNC * \see MDBX_SAFE_NOSYNC - * \see MDBX_MAPASYNC * \see MDBX_UTTERLY_NOSYNC * * @{ */ @@ -893,13 +891,14 @@ enum MDBX_env_flags_t { * huge difference in how are recycled the MVCC snapshots corresponding to * previous "steady" transactions (see below). * + * With \ref MDBX_WRITEMAP the `MDBX_SAFE_NOSYNC` instructs MDBX to use + * asynchronous mmap-flushes to disk. Asynchronous mmap-flushes means that + * actually all writes will scheduled and performed by operation system on it + * own manner, i.e. unordered. MDBX itself just notify operating system that + * it would be nice to write data to disk, but no more. + * * Depending on the platform and hardware, with `MDBX_SAFE_NOSYNC` you may get - * a multiple increase of write performance, even 10 times or more. \note Note - * that (`MDBX_SAFE_NOSYNC` | \ref MDBX_WRITEMAP) leaves the system with no - * hint for when to write transactions to disk. Therefore the - * (\ref MDBX_MAPASYNC | \ref MDBX_WRITEMAP) may be preferable, but without - * `MDBX_SAFE_NOSYNC` because the (\ref MDBX_MAPASYNC | `MDBX_SAFE_NOSYNC`) - * actually gives \ref MDBX_UTTERLY_NOSYNC. + * a multiple increase of write performance, even 10 times or more. * * In contrast to \ref MDBX_UTTERLY_NOSYNC mode, with `MDBX_SAFE_NOSYNC` flag * MDBX will keeps untouched pages within B-tree of the last transaction @@ -933,43 +932,15 @@ enum MDBX_env_flags_t { * * `MDBX_SAFE_NOSYNC` flag may be changed at any time using * \ref mdbx_env_set_flags() or by passing to \ref mdbx_txn_begin() for - * particular write transaction. - * - * \warning don't combine this flag with \ref MDBX_MAPASYNC since you will got - * \ref MDBX_UTTERLY_NOSYNC in that way. 
\see sync_modes */ + * particular write transaction. */ MDBX_SAFE_NOSYNC = UINT32_C(0x10000), - /** Use asynchronous msync when \ref MDBX_WRITEMAP is used. + /** \deprecated Please use \ref MDBX_SAFE_NOSYNC instead of `MDBX_MAPASYNC`. * - * `MDBX_MAPASYNC` meaningful and give effect only in conjunction - * with `MDBX_WRITEMAP` or `MDBX_SAFE_NOSYNC`: - * - with \ref MDBX_SAFE_NOSYNC actually gives \ref MDBX_UTTERLY_NOSYNC, - * which wipe previous steady commits for reuse pages as described above. - * - with \ref MDBX_WRITEMAP but without \ref MDBX_SAFE_NOSYNC instructs MDBX - * to use asynchronous mmap-flushes to disk as described below. - * - with both \ref MDBX_WRITEMAP and \ref MDBX_SAFE_NOSYNC you get the both - * effects. - * - * Asynchronous mmap-flushes means that actually all writes will scheduled and - * performed by operation system on it own manner, i.e. unordered. MDBX itself - * just notify operating system that it would be nice to write data to disk, - * but no more. - * - * With \ref MDBX_MAPASYNC flag, but without \ref MDBX_UTTERLY_NOSYNC (i.e. - * without OR'ing with \ref MDBX_SAFE_NOSYNC) MDBX will keeps untouched pages - * within B-tree of the last transaction "steady" which was synced to disk - * completely. So, this makes exactly the same "long-lived" impact and the - * same consequences as described above for \ref MDBX_SAFE_NOSYNC flag. - * - * Depending on the platform and hardware, with combination of - * \ref MDBX_WRITEMAP and \ref MDBX_MAPASYNC you may get a multiple increase - * of write performance, even 10-100 times or more. \ref MDBX_MAPASYNC flag - * may be changed at any time using \ref mdbx_env_set_flags() or by passing to - * \ref mdbx_txn_begin() for particular write transaction. - * - * \warning don't combine this flag with \ref MDBX_SAFE_NOSYNC since you will - * got \ref MDBX_UTTERLY_NOSYNC in that way. 
\see sync_modes */ - MDBX_MAPASYNC = UINT32_C(0x100000), + * Since version 0.9.x the `MDBX_MAPASYNC` is deprecated and has the same + * effect as \ref MDBX_SAFE_NOSYNC with \ref MDBX_WRITEMAP. This API + * simplification is just for convenience and clarity. */ + MDBX_MAPASYNC = MDBX_SAFE_NOSYNC, /** Don't sync anything and wipe previous steady commits. * @@ -1012,7 +983,7 @@ enum MDBX_env_flags_t { * `MDBX_UTTERLY_NOSYNC` flag may be changed at any time using * \ref mdbx_env_set_flags(), but don't has effect if passed to * \ref mdbx_txn_begin() for particular write transaction. \see sync_modes */ - MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | MDBX_MAPASYNC, + MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | UINT32_C(0x100000), /** @} end of SYNC MODES */ @@ -1459,14 +1430,14 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * \ref MDBX_NOMEMINIT, \ref MDBX_COALESCE, \ref MDBX_LIFORECLAIM. * See \ref env_flags section. * - * - \ref MDBX_NOMETASYNC, \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, - * \ref MDBX_MAPASYNC. See \ref sync_modes section. + * - \ref MDBX_NOMETASYNC, \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC. + * See \ref sync_modes section. * * \note `MDB_NOLOCK` flag don't supported by MDBX, * try use \ref MDBX_EXCLUSIVE as a replacement. * - * \note MDBX don't allow to mix processes with different \ref MDBX_SAFE_NOSYNC, - * \ref MDBX_MAPASYNC flags on the same environment. + * \note MDBX don't allow to mix processes with different \ref MDBX_SAFE_NOSYNC + * flags on the same environment. * In such case \ref MDBX_INCOMPATIBLE will be returned. * * If the database is already exist and parameters specified early by @@ -1493,7 +1464,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * more than once. * \retval MDBX_INCOMPATIBLE Environment is already opened by another process, * but with different set of \ref MDBX_SAFE_NOSYNC, - * \ref MDBX_MAPASYNC flags. + * \ref MDBX_UTTERLY_NOSYNC flags.
* Or if the database is already exist and parameters * specified early by \ref mdbx_env_set_geometry() * are incompatible (i.e. different pagesize, etc). @@ -1697,7 +1668,7 @@ MDBX_DEPRECATED LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, * \ingroup c_extra * * Unless the environment was opened with no-sync flags (\ref MDBX_NOMETASYNC, - * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC and \ref MDBX_MAPASYNC), then + * \ref MDBX_SAFE_NOSYNC and \ref MDBX_UTTERLY_NOSYNC), then * data is always written an flushed to disk when \ref mdbx_txn_commit() is * called. Otherwise \ref mdbx_env_sync() may be called to manually write and * flush unsynced data to disk. @@ -1741,18 +1712,18 @@ LIBMDBX_API int mdbx_env_sync(MDBX_env *env); LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env); /** Sets threshold to force flush the data buffers to disk, even any of - * \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC and \ref MDBX_MAPASYNC flags in - * the environment. + * \ref MDBX_SAFE_NOSYNC flag in the environment. * \ingroup c_settings * * The threshold value affects all processes which operates with given * environment until the last process close environment or a new value will be * settled. * - * Data is always written to disk when \ref mdbx_txn_commit() is called, but + * Data is always written to disk when \ref mdbx_txn_commit() is called, but * the operating system may keep it buffered. MDBX always flushes the OS buffers * upon commit as well, unless the environment was opened with - * \ref MDBX_SAFE_NOSYNC, \ref MDBX_MAPASYNC or in part \ref MDBX_NOMETASYNC. + * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC + * or in part \ref MDBX_NOMETASYNC. * * The default is 0, than mean no any threshold checked, and no additional * flush will be made. 
@@ -1765,8 +1736,7 @@ LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env); LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold); /** Sets relative period since the last unsteay commit to force flush the data - * buffers to disk, even any of \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC and - * \ref MDBX_MAPASYNC flags in the environment. + * buffers to disk, even with \ref MDBX_SAFE_NOSYNC flag in the environment. * \ingroup c_settings * * The relative period value affects all processes which operates with given @@ -1776,7 +1746,7 @@ LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold); * Data is always written to disk when \ref mdbx_txn_commit() is called, but the * operating system may keep it buffered. MDBX always flushes the OS buffers * upon commit as well, unless the environment was opened with - * \ref MDBX_SAFE_NOSYNC, \ref MDBX_MAPASYNC or in part \ref MDBX_NOMETASYNC. + * \ref MDBX_SAFE_NOSYNC or in part \ref MDBX_NOMETASYNC. * * Settled period don't checked asynchronously, but only by the * \ref mdbx_txn_commit() and \ref mdbx_env_sync() functions. Therefore, in @@ -1809,12 +1779,12 @@ LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env, * \ref mdbx_env_create(). * * \param [in] dont_sync A dont'sync flag, if non-zero the last checkpoint - * will be kept "as is" and may be still "weak" in the - * \ref MDBX_UTTERLY_NOSYNC or \ref MDBX_MAPASYNC modes. - * Such "weak" checkpoint will be ignored on opening next - * time, and transactions since the last non-weak - * checkpoint (meta-page update) will rolledback for - * consistency guarantee. + * will be kept "as is" and may be still "weak" in the + * \ref MDBX_SAFE_NOSYNC or \ref MDBX_UTTERLY_NOSYNC + * modes. Such "weak" checkpoint will be ignored on + * opening next time, and transactions since the last + * non-weak checkpoint (meta-page update) will be rolled + * back for consistency guarantee.
* * \returns A non-zero error value on failure and 0 on success, * some possible errors are: @@ -2294,8 +2264,7 @@ LIBMDBX_API void *mdbx_env_get_userctx(const MDBX_env *env); * - \ref MDBX_TRYTXN Do not block when starting * a write transaction. * - * - \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC or - * \ref MDBX_MAPASYNC. + * - \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC. * Do not sync data to disk corresponding * to \ref MDBX_NOMETASYNC or \ref MDBX_SAFE_NOSYNC * description. \see sync_modes. diff --git a/src/core.c b/src/core.c index f852a4b0..fa95f403 100644 --- a/src/core.c +++ b/src/core.c @@ -4922,7 +4922,7 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) err = errno; #else - err = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA); + err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); #endif if (unlikely(err != MDBX_SUCCESS)) return err; @@ -5646,7 +5646,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, int force, if (outside_txn) { if (unsynced_pages > /* FIXME: define threshold */ 16 && - (flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0) { + (flags & MDBX_SAFE_NOSYNC) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); @@ -5655,7 +5655,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, int force, /* LY: pre-sync without holding lock to reduce latency for writer(s) */ int err = (flags & MDBX_WRITEMAP) ? 
mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) - : mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA); + : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -5673,7 +5673,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, int force, } if (!META_IS_STEADY(head) || - ((flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0 && unsynced_pages)) { + ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, data_page(head)->mp_pgno, mdbx_durable_str(head), unsynced_pages); @@ -5694,11 +5694,10 @@ fastpath: if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) { const txnid_t head_txnid = mdbx_recent_committed_txnid(env); if (*env->me_meta_sync_txnid != (uint32_t)head_txnid) { - rc = - (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, NUM_METAS), false) - : mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = (flags & MDBX_WRITEMAP) + ? 
mdbx_msync(&env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, NUM_METAS), false) + : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (likely(rc == MDBX_SUCCESS)) *env->me_meta_sync_txnid = (uint32_t)head_txnid; } @@ -6480,7 +6479,7 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, mdbx_assert(env, (txn->mt_flags & ~(MDBX_NOTLS | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | - MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0); + MDBX_SAFE_NOSYNC)) == 0); txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO @@ -8584,7 +8583,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); - if (flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) { + if (flags & MDBX_SAFE_NOSYNC) { /* Check auto-sync conditions */ const pgno_t autosync_threshold = *env->me_autosync_threshold; const uint64_t autosync_period = *env->me_autosync_period; @@ -8681,38 +8680,27 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } /* LY: step#1 - sync previously written/updated data-pages */ - int rc = *env->me_unsynced_pages ? 
MDBX_RESULT_TRUE /* carry non-steady */ - : MDBX_RESULT_FALSE /* carry steady */; - if (rc != MDBX_RESULT_FALSE && (flags & MDBX_SAFE_NOSYNC) == 0) { + int rc = MDBX_RESULT_FALSE /* carry steady */; + if (*env->me_unsynced_pages) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - MDBX_meta *const recent_steady_meta = mdbx_meta_steady(env); - if (flags & MDBX_WRITEMAP) { - const size_t begin = - floor_powerof2(pgno2bytes(env, NUM_METAS), env->me_os_psize); - const size_t end = pgno_align2os_bytes(env, pending->mm_geo.next); - if (end > begin) { - rc = mdbx_msync(&env->me_dxb_mmap, begin, end - begin, - flags & MDBX_MAPASYNC); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - rc = MDBX_RESULT_TRUE /* carry non-steady */; - if ((flags & MDBX_MAPASYNC) == 0) { - if (unlikely(pending->mm_geo.next > recent_steady_meta->mm_geo.now)) { - rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_SIZE); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - rc = MDBX_RESULT_FALSE /* carry steady */; - } - } else { - rc = mdbx_filesync(env->me_lazy_fd, - (pending->mm_geo.next > recent_steady_meta->mm_geo.now) - ? MDBX_SYNC_DATA | MDBX_SYNC_SIZE - : MDBX_SYNC_DATA); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE; + if ((flags & MDBX_SAFE_NOSYNC) == 0) { + mode_bits = MDBX_SYNC_DATA; + if (pending->mm_geo.next > mdbx_meta_steady(env)->mm_geo.now) + mode_bits |= MDBX_SYNC_SIZE; + if (flags & MDBX_NOMETASYNC) + mode_bits |= MDBX_SYNC_IODQ; } + if (flags & MDBX_WRITEMAP) + rc = + mdbx_msync(&env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); + else + rc = mdbx_fsync(env->me_lazy_fd, mode_bits); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = (flags & MDBX_SAFE_NOSYNC) ? 
MDBX_RESULT_TRUE /* carry non-steady */ + : MDBX_RESULT_FALSE /* carry steady */; } /* Steady or Weak */ @@ -8825,24 +8813,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_datasync_sign = pending->mm_datasync_sign; mdbx_flush_incoherent_cpu_writeback(); mdbx_jitter4testing(true); - if ((flags & MDBX_SAFE_NOSYNC) == 0) { - /* sync meta-pages */ - const bool weak = (flags & (MDBX_MAPASYNC | MDBX_NOMETASYNC)) != 0; - rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - weak); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - if (!weak) { -#if defined(__APPLE__) && \ - MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY - rc = likely(fcntl(env->me_lazy_fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS - : errno; - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; -#endif /* MacOS */ - *env->me_meta_sync_txnid = pending->mm_txnid_a.low; - } - } + /* sync meta-pages */ + rc = + mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) ? 
MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; } else { const MDBX_meta undo_meta = *target; const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) @@ -8860,16 +8837,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, goto fail; } mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); - if ((flags & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - /* sync meta-pages */ - if (fd == env->me_lazy_fd) { - rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (rc != MDBX_SUCCESS) - goto undo; - } - *env->me_meta_sync_txnid = pending->mm_txnid_a.low; + /* sync meta-pages */ + if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { + rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (rc != MDBX_SUCCESS) + goto undo; } } + if (flags & MDBX_NOMETASYNC) + *env->me_unsynced_pages += 1; + else + *env->me_meta_sync_txnid = pending->mm_txnid_a.low; /* LY: shrink datafile if needed */ if (unlikely(shrink)) { @@ -9951,7 +9929,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, mdbx_error("initial-%s for lck-file failed", "msync"); goto bailout; } - err = mdbx_filesync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); + err = mdbx_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { mdbx_error("initial-%s for lck-file failed", "fsync"); goto bailout; @@ -10112,13 +10090,24 @@ __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { : MDBX_RESULT_TRUE; } -/* Merge flags and avoid false MDBX_UTTERLY_NOSYNC */ -static uint32_t merge_flags(const uint32_t a, const uint32_t b) { +/* Merge sync flags */ +static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { uint32_t r = a | b; + + /* avoid false MDBX_UTTERLY_NOSYNC */ if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && !F_ISSET(b, MDBX_UTTERLY_NOSYNC)) - r -= (r & MDBX_WRITEMAP) ? 
MDBX_UTTERLY_NOSYNC ^ MDBX_MAPASYNC - : MDBX_UTTERLY_NOSYNC ^ MDBX_SAFE_NOSYNC; + r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC; + + /* convert legacy MAPASYNC to SAFE_NOSYNC, but keep explicit UTTERLY_NOSYNC */ + if (!F_ISSET(r, MDBX_UTTERLY_NOSYNC) && + F_ISSET(r, MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC)) + r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; + + /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */ + if (r & MDBX_SAFE_NOSYNC) + r |= MDBX_NOMETASYNC; + assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && !F_ISSET(b, MDBX_UTTERLY_NOSYNC))); @@ -10152,7 +10141,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags, /* pickup previously mdbx_env_set_flags(), * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ - flags = merge_flags(flags, env->me_flags); + flags = merge_sync_flags(flags, env->me_flags); #if defined(_WIN32) || defined(_WIN64) const DWORD dwAttrib = GetFileAttributesW(pathnameW); @@ -10231,9 +10220,9 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags, if (flags & MDBX_RDONLY) { /* LY: silently ignore irrelevant flags when * we're only getting read access */ - flags &= - ~(MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | - MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE); + flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | + MDBX_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMEMINIT | MDBX_ACCEDE); } else { #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw.
@@ -10315,7 +10304,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags, goto bailout; } - const unsigned rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_MAPASYNC; + const unsigned rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; const unsigned mode_flags = rigorous_flags | MDBX_NOMETASYNC | MDBX_LIFORECLAIM | MDBX_COALESCE | MDBX_NORDAHEAD; @@ -16073,14 +16062,14 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, if (!dest_is_pipe) { if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); /* Write actual meta */ if (likely(rc == MDBX_SUCCESS)) rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } mdbx_memalign_free(buffer); @@ -16141,7 +16130,7 @@ int __cold mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff) { return rc; if (onoff) - env->me_flags = merge_flags(env->me_flags, flags); + env->me_flags = merge_sync_flags(env->me_flags, flags); else env->me_flags &= ~flags; diff --git a/src/internals.h b/src/internals.h index 3e1f55f9..c4d36374 100644 --- a/src/internals.h +++ b/src/internals.h @@ -908,6 +908,8 @@ struct MDBX_env { #define MDBX_ENV_ACTIVE UINT32_C(0x20000000) /* me_txkey is set */ #define MDBX_ENV_TXKEY UINT32_C(0x10000000) + /* Legacy MDBX_MAPASYNC (prior v0.9) */ +#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; mdbx_mmap_t me_dxb_mmap; /* The main data file */ @@ -1390,8 +1392,8 @@ ceil_powerof2(size_t value, size_t granularity) { * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ - (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC | MDBX_NOMEMINIT | \ - MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) diff --git a/src/osal.c b/src/osal.c index 43c8e837..360a76f2 100644 --- a/src/osal.c +++ b/src/osal.c @@ -783,13 +783,12 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, #endif } -MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd, - enum mdbx_syncmode_bits mode_bits) { +MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, + enum mdbx_syncmode_bits mode_bits) { #if defined(_WIN32) || defined(_WIN64) - return ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) == 0 || - FlushFileBuffers(fd)) - ? MDBX_SUCCESS - : GetLastError(); + if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd)) + return GetLastError(); + return MDBX_SUCCESS; #else #if defined(__APPLE__) && \ @@ -797,30 +796,37 @@ MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd, if (mode_bits & MDBX_SYNC_IODQ) return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno; #endif /* MacOS */ -#if defined(__linux__) || defined(__gnu_linux__) - if (mode_bits == MDBX_SYNC_SIZE && mdbx_linux_kernel_version >= 0x03060000) - return MDBX_SUCCESS; -#endif /* Linux */ - int rc; - do { + + /* LY: This approach is always safe and without appreciable performance + * degradation, even on a kernel with fdatasync's bug. 
+ * + * For more info about of a corresponding fdatasync() bug + * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ + while (1) { + switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { + case MDBX_SYNC_NONE: + return MDBX_SUCCESS /* nothing to do */; #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 - /* LY: This code is always safe and without appreciable performance - * degradation, even on a kernel with fdatasync's bug. - * - * For more info about of a corresponding fdatasync() bug - * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - if ((mode_bits & MDBX_SYNC_SIZE) == 0) { + case MDBX_SYNC_DATA: if (fdatasync(fd) == 0) return MDBX_SUCCESS; - } else -#else - (void)mode_bits; -#endif - if (fsync(fd) == 0) - return MDBX_SUCCESS; - rc = errno; - } while (rc == EINTR); - return rc; + break /* error */; +#if defined(__linux__) || defined(__gnu_linux__) + case MDBX_SYNC_SIZE: + if (mdbx_linux_kernel_version >= 0x03060000) + return MDBX_SUCCESS; + __fallthrough /* fall through */; +#endif /* Linux */ +#endif /* _POSIX_SYNCHRONIZED_IO > 0 */ + default: + if (fsync(fd) == 0) + return MDBX_SUCCESS; + } + + int rc = errno; + if (rc != EINTR) + return rc; + } #endif } @@ -938,24 +944,24 @@ MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { /*----------------------------------------------------------------------------*/ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, - size_t length, int async) { + size_t length, + enum mdbx_syncmode_bits mode_bits) { uint8_t *ptr = (uint8_t *)map->address + offset; #if defined(_WIN32) || defined(_WIN64) - if (FlushViewOfFile(ptr, length) && (async || FlushFileBuffers(map->fd))) - return MDBX_SUCCESS; - return GetLastError(); + if (!FlushViewOfFile(ptr, length)) + return GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) - if (async && mdbx_linux_kernel_version > 0x02061300) - /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op, - since the 
kernel properly tracks dirty pages and flushes them to storage - as necessary. */ + if (mode_bits == MDBX_SYNC_NONE && mdbx_linux_kernel_version > 0x02061300) + /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly + * tracks dirty pages and flushes them to storage as necessary. */ return MDBX_SUCCESS; #endif /* Linux */ - const int mode = async ? MS_ASYNC : MS_SYNC; - int rc = (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno; - return rc; + if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) + return errno; + mode_bits &= ~MDBX_SYNC_DATA; #endif + return mdbx_fsync(map->fd, mode_bits); } MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, diff --git a/src/osal.h b/src/osal.h index 8d134ba2..8fef7cf7 100644 --- a/src/osal.h +++ b/src/osal.h @@ -572,13 +572,14 @@ mdbx_thread_create(mdbx_thread_t *thread, MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); enum mdbx_syncmode_bits { + MDBX_SYNC_NONE = 0, MDBX_SYNC_DATA = 1, MDBX_SYNC_SIZE = 2, MDBX_SYNC_IODQ = 4 }; -MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, + const enum mdbx_syncmode_bits mode_bits); MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); @@ -635,7 +636,8 @@ MDBX_INTERNAL_FUNC int mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, - size_t length, int async); + size_t length, + enum mdbx_syncmode_bits mode_bits); MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); diff --git a/test/config.cc b/test/config.cc index f4e3f1d1..1b057397 100644 --- a/test/config.cc +++ b/test/config.cc @@ -298,7 +298,6 
@@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, const struct option_verb mode_bits[] = { {"rdonly", unsigned(MDBX_RDONLY)}, - {"mapasync", unsigned(MDBX_MAPASYNC)}, {"nosync-utterly", unsigned(MDBX_UTTERLY_NOSYNC)}, {"nosubdir", unsigned(MDBX_NOSUBDIR)}, {"nosync-safe", unsigned(MDBX_SAFE_NOSYNC)}, diff --git a/test/main.cc b/test/main.cc index 10016ab3..19f3b81f 100644 --- a/test/main.cc +++ b/test/main.cc @@ -95,7 +95,6 @@ void __noreturn usage(void) { " coalesce == MDBX_COALESCE\n" " nosync-safe == MDBX_SAFE_NOSYNC\n" " writemap == MDBX_WRITEMAP\n" - " mapasync == MDBX_MAPASYNC\n" " nosync-utterly == MDBX_UTTERLY_NOSYNC\n" " perturb == MDBX_PAGEPERTURB\n" " notls == MDBX_NOTLS\n" @@ -125,8 +124,8 @@ void actor_params::set_defaults(const std::string &tmpdir) { #endif pathname_db = tmpdir + "mdbx-test.db"; - mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NOMEMINIT | - MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_ACCEDE; + mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_SAFE_NOSYNC | + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_ACCEDE; table_flags = MDBX_DUPSORT; size_lower = -1; diff --git a/test/nested.cc b/test/nested.cc index 85df6fa6..a0054062 100644 --- a/test/nested.cc +++ b/test/nested.cc @@ -77,8 +77,7 @@ bool testcase_nested::teardown() { void testcase_nested::push_txn() { MDBX_txn *txn; - unsigned flags = - prng32() & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC); + unsigned flags = prng32() & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC); int err = mdbx_txn_begin(db_guard.get(), txn_guard.get(), flags, &txn); if (unlikely(err != MDBX_SUCCESS)) failure_perror("mdbx_txn_begin(nested)", err); diff --git a/test/pcrf/pcrf_test.c b/test/pcrf/pcrf_test.c index 2db58a02..e309cf0a 100644 --- a/test/pcrf/pcrf_test.c +++ b/test/pcrf/pcrf_test.c @@ -109,8 +109,8 @@ static void db_connect() { env, 0, 0, REC_COUNT * sizeof(session_data_t) * 10, -1, -1, -1)); MDBX_CHECK(mdbx_env_set_maxdbs(env, 30)); 
MDBX_CHECK(mdbx_env_open(env, opt_db_path, - MDBX_CREATE | MDBX_WRITEMAP | MDBX_MAPASYNC | - MDBX_SAFE_NOSYNC | MDBX_LIFORECLAIM, + MDBX_CREATE | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC | + MDBX_LIFORECLAIM, 0664)); MDBX_txn *txn;