mdbx: drop/deprecate MDBX_MAPASYNC.

Change-Id: I472f97f568a32325eb056c8ee4d2f2350a473bda
This commit is contained in:
Leonid Yuriev 2020-08-01 19:13:17 +03:00
parent 135bead730
commit 5e43ee61a2
11 changed files with 182 additions and 214 deletions

View File

@ -155,7 +155,7 @@ check: test dist
test: build-test
rm -f $(TEST_DB) $(TEST_LOG) && (set -o pipefail; \
(./mdbx_test --table=+data.integer --keygen.split=29 --datalen.min=min --datalen.max=max --progress --console=no --repeat=$(TEST_ITER) --pathname=$(TEST_DB) --dont-cleanup-after basic && \
./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=12 --pathname=$(TEST_DB) --dont-cleanup-after basic) \
./mdbx_test --mode=-writemap,-nosync-safe,-lifo --progress --console=no --repeat=12 --pathname=$(TEST_DB) --dont-cleanup-after basic) \
| tee >(gzip --stdout > $(TEST_LOG)) | tail -n 42) \
&& ./mdbx_chk -vvn $(TEST_DB) && ./mdbx_chk -vvn $(TEST_DB)-copy
@ -163,7 +163,7 @@ test-singleprocess: all mdbx_test
rm -f $(TEST_DB) $(TEST_LOG) && (set -o pipefail; \
(./mdbx_test --table=+data.integer --keygen.split=29 --datalen.min=min --datalen.max=max --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --hill && \
./mdbx_test --progress --console=no --repeat=2 --pathname=$(TEST_DB) --dont-cleanup-before --dont-cleanup-after --copy && \
./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --nested) \
./mdbx_test --mode=-writemap,-nosync-safe,-lifo --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --nested) \
| tee >(gzip --stdout > $(TEST_LOG)) | tail -n 42) \
&& ./mdbx_chk -vvn $(TEST_DB) && ./mdbx_chk -vvn $(TEST_DB)-copy
@ -178,7 +178,7 @@ memcheck test-valgrind:
rm -f valgrind-*.log $(TEST_DB) $(TEST_LOG) && (set -o pipefail; ( \
$(VALGRIND) ./mdbx_test --table=+data.integer --keygen.split=29 --datalen.min=min --datalen.max=max --progress --console=no --repeat=2 --pathname=$(TEST_DB) --dont-cleanup-after basic && \
$(VALGRIND) ./mdbx_test --progress --console=no --pathname=$(TEST_DB) --dont-cleanup-before --dont-cleanup-after --copy && \
$(VALGRIND) ./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=4 --pathname=$(TEST_DB) --dont-cleanup-after basic && \
$(VALGRIND) ./mdbx_test --mode=-writemap,-nosync-safe,-lifo --progress --console=no --repeat=4 --pathname=$(TEST_DB) --dont-cleanup-after basic && \
$(VALGRIND) ./mdbx_chk -vvn $(TEST_DB) && \
$(VALGRIND) ./mdbx_chk -vvn $(TEST_DB)-copy \
) | tee >(gzip --stdout > $(TEST_LOG)) | tail -n 42)

View File

@ -227,7 +227,7 @@ the user's point of view.
7. Fast estimation of range query result volume, i.e. how many items can
be found between a `KEY1` and a `KEY2`. This is a prerequisite for building
and/or optimizing query execution plans.
> _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to corresponding keys.
> _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to corresponding keys.
8. `mdbx_chk` tool for database integrity check.
@ -259,9 +259,13 @@ pair, to the first, to the last, or not set to anything.
## Other fixes and specifics
1. Fixed more than 10 significant errors, in particular: page leaks, wrong sub-database statistics, segfault in several conditions, nonoptimal page merge strategy, updating an existing record with a change in data size (including for multimap), etc.
1. Fixed more than 10 significant errors, in particular: page leaks,
wrong sub-database statistics, segfault in several conditions,
nonoptimal page merge strategy, updating an existing record with
a change in data size (including for multimap), etc.
2. All cursors can be reused and should be closed explicitly, regardless ones were opened within a write or read transaction.
2. All cursors can be reused and should be closed explicitly,
regardless of whether they were opened within a write or read transaction.
3. Opening database handles are spared from race conditions and
pre-opening is not needed.
@ -269,10 +273,9 @@ pre-opening is not needed.
4. Returning `MDBX_EMULTIVAL` error in case of ambiguous update or delete.
5. Guarantee of database integrity even in asynchronous unordered write-to-disk mode.
> _libmdbx_ propose additional trade-off by implementing append-like manner for updates
> in `MDBX_SAFE_NOSYNC` and `MDBX_WRITEMAP|MDBX_MAPASYNC` modes, that avoid database corruption after a system crash
> contrary to LMDB. Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode is available to match LMDB behaviour,
> and for special use-cases.
> _libmdbx_ offers an additional trade-off via `MDBX_SAFE_NOSYNC`, which applies updates in an append-like manner
> and thereby avoids database corruption after a system crash, contrary to LMDB.
> Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode is available to match the behaviour of `MDB_NOSYNC` in LMDB.
6. On **MacOS & iOS** the `fcntl(F_FULLFSYNC)` syscall is used _by
default_ to synchronize data with the disk, as this is [the only way to

115
mdbx.h
View File

@ -809,13 +809,12 @@ enum MDBX_env_flags_t {
* series of write transactions, will be as small as possible. Thus creates
* ideal conditions for the efficient operation of the disk write-back cache.
*
* \ref MDBX_LIFORECLAIM is compatible with all no-sync flags (i.e.
* \ref MDBX_NOMETASYNC, \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref
* MDBX_MAPASYNC), but gives no noticeable impact in combination with \ref
* MDBX_SAFE_NOSYNC. Because MDBX will reused pages only before the last
* "steady" MVCC-snapshot, i.e. the loop length of database pages circulation
* will be mostly defined by frequency of calling `mdbx_env_sync()` rather
* than LIFO and FIFO difference.
* \ref MDBX_LIFORECLAIM is compatible with all no-sync flags, but gives NO
* noticeable impact in combination with \ref MDBX_SAFE_NOSYNC or
* \ref MDBX_UTTERLY_NOSYNC. This is because MDBX will reuse pages only
* before the last "steady" MVCC-snapshot, i.e. the loop length of database
* pages circulation will be mostly defined by the frequency of calling
* `mdbx_env_sync()` rather than by the difference between LIFO and FIFO.
*
* This flag may be changed at any time using mdbx_env_set_flags(). */
MDBX_LIFORECLAIM = UINT32_C(0x4000000),
@ -827,9 +826,9 @@ enum MDBX_env_flags_t {
/** \defgroup sync_modes SYNC MODES
*
* \attention Using any combination of \ref MDBX_SAFE_NOSYNC, \ref
* MDBX_NOMETASYNC, \ref MDBX_MAPASYNC and especially \ref MDBX_UTTERLY_NOSYNC
* is always a deal to reduce durability for gain write performance. You must
* know exactly what you are doing and what risks you are taking!
* MDBX_NOMETASYNC and especially \ref MDBX_UTTERLY_NOSYNC is always a
* trade-off that reduces durability to gain write performance. You must know
* exactly what you are doing and what risks you are taking!
*
* \note for LMDB users: \ref MDBX_SAFE_NOSYNC is NOT similar to MDB_NOSYNC,
* but \ref MDBX_UTTERLY_NOSYNC exactly matches MDB_NOSYNC. See details
@ -863,7 +862,6 @@ enum MDBX_env_flags_t {
*
* \see MDBX_NOMETASYNC
* \see MDBX_SAFE_NOSYNC
* \see MDBX_MAPASYNC
* \see MDBX_UTTERLY_NOSYNC
*
* @{ */
@ -893,13 +891,14 @@ enum MDBX_env_flags_t {
* huge difference in how are recycled the MVCC snapshots corresponding to
* previous "steady" transactions (see below).
*
* With \ref MDBX_WRITEMAP the `MDBX_SAFE_NOSYNC` instructs MDBX to use
* asynchronous mmap-flushes to disk. Asynchronous mmap-flushes mean that
* all writes will actually be scheduled and performed by the operating
* system in its own manner, i.e. unordered. MDBX itself just notifies the
* operating system that it would be nice to write data to disk, but no more.
*
* Depending on the platform and hardware, with `MDBX_SAFE_NOSYNC` you may get
* a multiple increase of write performance, even 10 times or more. \note Note
* that (`MDBX_SAFE_NOSYNC` | \ref MDBX_WRITEMAP) leaves the system with no
* hint for when to write transactions to disk. Therefore the
* (\ref MDBX_MAPASYNC | \ref MDBX_WRITEMAP) may be preferable, but without
* `MDBX_SAFE_NOSYNC` because the (\ref MDBX_MAPASYNC | `MDBX_SAFE_NOSYNC`)
* actually gives \ref MDBX_UTTERLY_NOSYNC.
* a multiple increase of write performance, even 10 times or more.
*
* In contrast to \ref MDBX_UTTERLY_NOSYNC mode, with `MDBX_SAFE_NOSYNC` flag
* MDBX will keep untouched pages within B-tree of the last transaction
@ -933,43 +932,15 @@ enum MDBX_env_flags_t {
*
* `MDBX_SAFE_NOSYNC` flag may be changed at any time using
* \ref mdbx_env_set_flags() or by passing to \ref mdbx_txn_begin() for
* particular write transaction.
*
* \warning don't combine this flag with \ref MDBX_MAPASYNC since you will got
* \ref MDBX_UTTERLY_NOSYNC in that way. \see sync_modes */
* particular write transaction. */
MDBX_SAFE_NOSYNC = UINT32_C(0x10000),
/** Use asynchronous msync when \ref MDBX_WRITEMAP is used.
/** \deprecated Please use \ref MDBX_SAFE_NOSYNC instead of `MDBX_MAPASYNC`.
*
* `MDBX_MAPASYNC` meaningful and give effect only in conjunction
* with `MDBX_WRITEMAP` or `MDBX_SAFE_NOSYNC`:
* - with \ref MDBX_SAFE_NOSYNC actually gives \ref MDBX_UTTERLY_NOSYNC,
* which wipe previous steady commits for reuse pages as described above.
* - with \ref MDBX_WRITEMAP but without \ref MDBX_SAFE_NOSYNC instructs MDBX
* to use asynchronous mmap-flushes to disk as described below.
* - with both \ref MDBX_WRITEMAP and \ref MDBX_SAFE_NOSYNC you get the both
* effects.
*
* Asynchronous mmap-flushes means that actually all writes will scheduled and
* performed by operation system on it own manner, i.e. unordered. MDBX itself
* just notify operating system that it would be nice to write data to disk,
* but no more.
*
* With \ref MDBX_MAPASYNC flag, but without \ref MDBX_UTTERLY_NOSYNC (i.e.
* without OR'ing with \ref MDBX_SAFE_NOSYNC) MDBX will keeps untouched pages
* within B-tree of the last transaction "steady" which was synced to disk
* completely. So, this makes exactly the same "long-lived" impact and the
* same consequences as described above for \ref MDBX_SAFE_NOSYNC flag.
*
* Depending on the platform and hardware, with combination of
* \ref MDBX_WRITEMAP and \ref MDBX_MAPASYNC you may get a multiple increase
* of write performance, even 10-100 times or more. \ref MDBX_MAPASYNC flag
* may be changed at any time using \ref mdbx_env_set_flags() or by passing to
* \ref mdbx_txn_begin() for particular write transaction.
*
* \warning don't combine this flag with \ref MDBX_SAFE_NOSYNC since you will
* got \ref MDBX_UTTERLY_NOSYNC in that way. \see sync_modes */
MDBX_MAPASYNC = UINT32_C(0x100000),
* Since version 0.9.x the `MDBX_MAPASYNC` is deprecated and has the same
* effect as \ref MDBX_SAFE_NOSYNC with \ref MDBX_WRITEMAP. This is just an
* API simplification for convenience and clarity. */
MDBX_MAPASYNC = MDBX_SAFE_NOSYNC,
/** Don't sync anything and wipe previous steady commits.
*
@ -1012,7 +983,7 @@ enum MDBX_env_flags_t {
* `MDBX_UTTERLY_NOSYNC` flag may be changed at any time using
* \ref mdbx_env_set_flags(), but has no effect if passed to
* \ref mdbx_txn_begin() for a particular write transaction.
* \see sync_modes */
MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | MDBX_MAPASYNC,
MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | UINT32_C(0x100000),
/** @} end of SYNC MODES */
@ -1459,14 +1430,14 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv);
* \ref MDBX_NOMEMINIT, \ref MDBX_COALESCE, \ref MDBX_LIFORECLAIM.
* See \ref env_flags section.
*
* - \ref MDBX_NOMETASYNC, \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC,
* \ref MDBX_MAPASYNC. See \ref sync_modes section.
* - \ref MDBX_NOMETASYNC, \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC.
* See \ref sync_modes section.
*
* \note `MDB_NOLOCK` flag is not supported by MDBX,
* try using \ref MDBX_EXCLUSIVE as a replacement.
*
* \note MDBX don't allow to mix processes with different \ref MDBX_SAFE_NOSYNC,
* \ref MDBX_MAPASYNC flags on the same environment.
* \note MDBX doesn't allow mixing processes with different
* \ref MDBX_SAFE_NOSYNC flags on the same environment.
* In such a case \ref MDBX_INCOMPATIBLE will be returned.
*
* If the database is already exist and parameters specified early by
@ -1493,7 +1464,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv);
* more than once.
* \retval MDBX_INCOMPATIBLE Environment is already opened by another process,
* but with different set of \ref MDBX_SAFE_NOSYNC,
* \ref MDBX_MAPASYNC flags.
* \ref MDBX_UTTERLY_NOSYNC flags.
* Or if the database already exists and parameters
* specified earlier by \ref mdbx_env_set_geometry()
* are incompatible (i.e. different pagesize, etc).
@ -1697,7 +1668,7 @@ MDBX_DEPRECATED LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info,
* \ingroup c_extra
*
* Unless the environment was opened with no-sync flags (\ref MDBX_NOMETASYNC,
* \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC and \ref MDBX_MAPASYNC), then
* \ref MDBX_SAFE_NOSYNC and \ref MDBX_UTTERLY_NOSYNC), then
* data is always written and flushed to disk when \ref mdbx_txn_commit() is
* called. Otherwise \ref mdbx_env_sync() may be called to manually write and
* flush unsynced data to disk.
@ -1741,18 +1712,18 @@ LIBMDBX_API int mdbx_env_sync(MDBX_env *env);
LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env);
/** Sets threshold to force flush the data buffers to disk, even any of
* \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC and \ref MDBX_MAPASYNC flags in
* the environment.
* \ref MDBX_SAFE_NOSYNC flag in the environment.
* \ingroup c_settings
*
* The threshold value affects all processes which operate with the given
* environment until the last process closes the environment or a new value
* is set.
*
* Data is always written to disk when \ref mdbx_txn_commit() is called, but
* Data is always written to disk when \ref mdbx_txn_commit() is called, but
* the operating system may keep it buffered. MDBX always flushes the OS buffers
* upon commit as well, unless the environment was opened with
* \ref MDBX_SAFE_NOSYNC, \ref MDBX_MAPASYNC or in part \ref MDBX_NOMETASYNC.
* \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC
* or in part \ref MDBX_NOMETASYNC.
*
* The default is 0, which means no threshold is checked and no additional
* flush will be made.
@ -1765,8 +1736,7 @@ LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env);
LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold);
/** Sets relative period since the last unsteady commit to force flush the data
* buffers to disk, even any of \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC and
* \ref MDBX_MAPASYNC flags in the environment.
* buffers to disk, even with the \ref MDBX_SAFE_NOSYNC flag in the environment.
* \ingroup c_settings
*
* The relative period value affects all processes which operates with given
@ -1776,7 +1746,7 @@ LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold);
* Data is always written to disk when \ref mdbx_txn_commit() is called, but the
* operating system may keep it buffered. MDBX always flushes the OS buffers
* upon commit as well, unless the environment was opened with
* \ref MDBX_SAFE_NOSYNC, \ref MDBX_MAPASYNC or in part \ref MDBX_NOMETASYNC.
* \ref MDBX_SAFE_NOSYNC or in part \ref MDBX_NOMETASYNC.
*
* The configured period is not checked asynchronously, but only by the
* \ref mdbx_txn_commit() and \ref mdbx_env_sync() functions. Therefore, in
@ -1809,12 +1779,12 @@ LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env,
* \ref mdbx_env_create().
*
* \param [in] dont_sync A don't-sync flag; if non-zero the last checkpoint
* will be kept "as is" and may be still "weak" in the
* \ref MDBX_UTTERLY_NOSYNC or \ref MDBX_MAPASYNC modes.
* Such "weak" checkpoint will be ignored on opening next
* time, and transactions since the last non-weak
* checkpoint (meta-page update) will rolledback for
* consistency guarantee.
* will be kept "as is" and may still be "weak" in the
* \ref MDBX_SAFE_NOSYNC or \ref MDBX_UTTERLY_NOSYNC
* modes. Such a "weak" checkpoint will be ignored on
* the next open, and transactions since the last
* non-weak checkpoint (meta-page update) will be rolled
* back to guarantee consistency.
*
* \returns A non-zero error value on failure and 0 on success,
* some possible errors are:
@ -2294,8 +2264,7 @@ LIBMDBX_API void *mdbx_env_get_userctx(const MDBX_env *env);
* - \ref MDBX_TRYTXN Do not block when starting
* a write transaction.
*
* - \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC or
* \ref MDBX_MAPASYNC.
* - \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC.
* Do not sync data to disk corresponding
* to \ref MDBX_NOMETASYNC or \ref MDBX_SAFE_NOSYNC
* description. \see sync_modes.

View File

@ -4922,7 +4922,7 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) {
SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER))
err = errno;
#else
err = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA);
err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
#endif
if (unlikely(err != MDBX_SUCCESS))
return err;
@ -5646,7 +5646,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, int force,
if (outside_txn) {
if (unsynced_pages > /* FIXME: define threshold */ 16 &&
(flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0) {
(flags & MDBX_SAFE_NOSYNC) == 0) {
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next);
@ -5655,7 +5655,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, int force,
/* LY: pre-sync without holding lock to reduce latency for writer(s) */
int err = (flags & MDBX_WRITEMAP)
? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false)
: mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA);
: mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
if (unlikely(err != MDBX_SUCCESS))
return err;
@ -5673,7 +5673,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, int force,
}
if (!META_IS_STEADY(head) ||
((flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0 && unsynced_pages)) {
((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO,
data_page(head)->mp_pgno, mdbx_durable_str(head),
unsynced_pages);
@ -5694,11 +5694,10 @@ fastpath:
if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) {
const txnid_t head_txnid = mdbx_recent_committed_txnid(env);
if (*env->me_meta_sync_txnid != (uint32_t)head_txnid) {
rc =
(flags & MDBX_WRITEMAP)
? mdbx_msync(&env->me_dxb_mmap, 0,
pgno_align2os_bytes(env, NUM_METAS), false)
: mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
rc = (flags & MDBX_WRITEMAP)
? mdbx_msync(&env->me_dxb_mmap, 0,
pgno_align2os_bytes(env, NUM_METAS), false)
: mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (likely(rc == MDBX_SUCCESS))
*env->me_meta_sync_txnid = (uint32_t)head_txnid;
}
@ -6480,7 +6479,7 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags,
mdbx_assert(env,
(txn->mt_flags & ~(MDBX_NOTLS | MDBX_RDONLY | MDBX_WRITEMAP |
MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC |
MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0);
MDBX_SAFE_NOSYNC)) == 0);
txn->mt_signature = MDBX_MT_SIGNATURE;
*ret = txn;
mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
@ -8584,7 +8583,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0);
mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now);
if (flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) {
if (flags & MDBX_SAFE_NOSYNC) {
/* Check auto-sync conditions */
const pgno_t autosync_threshold = *env->me_autosync_threshold;
const uint64_t autosync_period = *env->me_autosync_period;
@ -8681,38 +8680,27 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
}
/* LY: step#1 - sync previously written/updated data-pages */
int rc = *env->me_unsynced_pages ? MDBX_RESULT_TRUE /* carry non-steady */
: MDBX_RESULT_FALSE /* carry steady */;
if (rc != MDBX_RESULT_FALSE && (flags & MDBX_SAFE_NOSYNC) == 0) {
int rc = MDBX_RESULT_FALSE /* carry steady */;
if (*env->me_unsynced_pages) {
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
MDBX_meta *const recent_steady_meta = mdbx_meta_steady(env);
if (flags & MDBX_WRITEMAP) {
const size_t begin =
floor_powerof2(pgno2bytes(env, NUM_METAS), env->me_os_psize);
const size_t end = pgno_align2os_bytes(env, pending->mm_geo.next);
if (end > begin) {
rc = mdbx_msync(&env->me_dxb_mmap, begin, end - begin,
flags & MDBX_MAPASYNC);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
rc = MDBX_RESULT_TRUE /* carry non-steady */;
if ((flags & MDBX_MAPASYNC) == 0) {
if (unlikely(pending->mm_geo.next > recent_steady_meta->mm_geo.now)) {
rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_SIZE);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
rc = MDBX_RESULT_FALSE /* carry steady */;
}
} else {
rc = mdbx_filesync(env->me_lazy_fd,
(pending->mm_geo.next > recent_steady_meta->mm_geo.now)
? MDBX_SYNC_DATA | MDBX_SYNC_SIZE
: MDBX_SYNC_DATA);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE;
if ((flags & MDBX_SAFE_NOSYNC) == 0) {
mode_bits = MDBX_SYNC_DATA;
if (pending->mm_geo.next > mdbx_meta_steady(env)->mm_geo.now)
mode_bits |= MDBX_SYNC_SIZE;
if (flags & MDBX_NOMETASYNC)
mode_bits |= MDBX_SYNC_IODQ;
}
if (flags & MDBX_WRITEMAP)
rc =
mdbx_msync(&env->me_dxb_mmap, 0,
pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits);
else
rc = mdbx_fsync(env->me_lazy_fd, mode_bits);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */
: MDBX_RESULT_FALSE /* carry steady */;
}
/* Steady or Weak */
@ -8825,24 +8813,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
target->mm_datasync_sign = pending->mm_datasync_sign;
mdbx_flush_incoherent_cpu_writeback();
mdbx_jitter4testing(true);
if ((flags & MDBX_SAFE_NOSYNC) == 0) {
/* sync meta-pages */
const bool weak = (flags & (MDBX_MAPASYNC | MDBX_NOMETASYNC)) != 0;
rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
weak);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
if (!weak) {
#if defined(__APPLE__) && \
MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
rc = likely(fcntl(env->me_lazy_fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS
: errno;
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
#endif /* MacOS */
*env->me_meta_sync_txnid = pending->mm_txnid_a.low;
}
}
/* sync meta-pages */
rc =
mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
(flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE
: MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
} else {
const MDBX_meta undo_meta = *target;
const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
@ -8860,16 +8837,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
goto fail;
}
mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize);
if ((flags & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
/* sync meta-pages */
if (fd == env->me_lazy_fd) {
rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (rc != MDBX_SUCCESS)
goto undo;
}
*env->me_meta_sync_txnid = pending->mm_txnid_a.low;
/* sync meta-pages */
if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) {
rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (rc != MDBX_SUCCESS)
goto undo;
}
}
if (flags & MDBX_NOMETASYNC)
*env->me_unsynced_pages += 1;
else
*env->me_meta_sync_txnid = pending->mm_txnid_a.low;
/* LY: shrink datafile if needed */
if (unlikely(shrink)) {
@ -9951,7 +9929,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
mdbx_error("initial-%s for lck-file failed", "msync");
goto bailout;
}
err = mdbx_filesync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE);
err = mdbx_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE);
if (unlikely(err != MDBX_SUCCESS)) {
mdbx_error("initial-%s for lck-file failed", "fsync");
goto bailout;
@ -10112,13 +10090,24 @@ __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) {
: MDBX_RESULT_TRUE;
}
/* Merge flags and avoid false MDBX_UTTERLY_NOSYNC */
static uint32_t merge_flags(const uint32_t a, const uint32_t b) {
/* Merge sync flags */
static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) {
uint32_t r = a | b;
/* avoid false MDBX_UTTERLY_NOSYNC */
if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
!F_ISSET(b, MDBX_UTTERLY_NOSYNC))
r -= (r & MDBX_WRITEMAP) ? MDBX_UTTERLY_NOSYNC ^ MDBX_MAPASYNC
: MDBX_UTTERLY_NOSYNC ^ MDBX_SAFE_NOSYNC;
r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC;
/* convert MDBX_DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */
if ((r & (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC)) ==
(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC))
r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC;
/* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */
if (r & MDBX_SAFE_NOSYNC)
r |= MDBX_NOMETASYNC;
assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) &&
!F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
!F_ISSET(b, MDBX_UTTERLY_NOSYNC)));
@ -10152,7 +10141,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags,
/* pickup previously mdbx_env_set_flags(),
* but avoid MDBX_UTTERLY_NOSYNC by disjunction */
flags = merge_flags(flags, env->me_flags);
flags = merge_sync_flags(flags, env->me_flags);
#if defined(_WIN32) || defined(_WIN64)
const DWORD dwAttrib = GetFileAttributesW(pathnameW);
@ -10231,9 +10220,9 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags,
if (flags & MDBX_RDONLY) {
/* LY: silently ignore irrelevant flags when
* we're only getting read access */
flags &=
~(MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC |
MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE);
flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC |
MDBX_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM |
MDBX_NOMEMINIT | MDBX_ACCEDE);
} else {
#if MDBX_MMAP_INCOHERENT_FILE_WRITE
/* Temporary `workaround` for OpenBSD kernel's flaw.
@ -10315,7 +10304,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags,
goto bailout;
}
const unsigned rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_MAPASYNC;
const unsigned rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC;
const unsigned mode_flags = rigorous_flags | MDBX_NOMETASYNC |
MDBX_LIFORECLAIM | MDBX_COALESCE | MDBX_NORDAHEAD;
@ -16073,14 +16062,14 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
if (!dest_is_pipe) {
if (likely(rc == MDBX_SUCCESS))
rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
/* Write actual meta */
if (likely(rc == MDBX_SUCCESS))
rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);
if (likely(rc == MDBX_SUCCESS))
rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
}
mdbx_memalign_free(buffer);
@ -16141,7 +16130,7 @@ int __cold mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff) {
return rc;
if (onoff)
env->me_flags = merge_flags(env->me_flags, flags);
env->me_flags = merge_sync_flags(env->me_flags, flags);
else
env->me_flags &= ~flags;

View File

@ -908,6 +908,8 @@ struct MDBX_env {
#define MDBX_ENV_ACTIVE UINT32_C(0x20000000)
/* me_txkey is set */
#define MDBX_ENV_TXKEY UINT32_C(0x10000000)
/* Legacy MDBX_MAPASYNC (prior v0.9) */
#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000)
#define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY)
uint32_t me_flags;
mdbx_mmap_t me_dxb_mmap; /* The main data file */
@ -1390,8 +1392,8 @@ ceil_powerof2(size_t value, size_t granularity) {
* at runtime. Changing other flags requires closing the
* environment and re-opening it with the new flags. */
#define ENV_CHANGEABLE_FLAGS \
(MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC | MDBX_NOMEMINIT | \
MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE)
(MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \
MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE)
#define ENV_CHANGELESS_FLAGS \
(MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \
MDBX_LIFORECLAIM | MDBX_EXCLUSIVE)

View File

@ -783,13 +783,12 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
#endif
}
MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd,
enum mdbx_syncmode_bits mode_bits) {
MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd,
enum mdbx_syncmode_bits mode_bits) {
#if defined(_WIN32) || defined(_WIN64)
return ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) == 0 ||
FlushFileBuffers(fd))
? MDBX_SUCCESS
: GetLastError();
if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd))
return GetLastError();
return MDBX_SUCCESS;
#else
#if defined(__APPLE__) && \
@ -797,30 +796,37 @@ MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd,
if (mode_bits & MDBX_SYNC_IODQ)
return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
#endif /* MacOS */
#if defined(__linux__) || defined(__gnu_linux__)
if (mode_bits == MDBX_SYNC_SIZE && mdbx_linux_kernel_version >= 0x03060000)
return MDBX_SUCCESS;
#endif /* Linux */
int rc;
do {
/* LY: This approach is always safe and without appreciable performance
* degradation, even on a kernel with fdatasync's bug.
*
* For more info about of a corresponding fdatasync() bug
* see http://www.spinics.net/lists/linux-ext4/msg33714.html */
while (1) {
switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) {
case MDBX_SYNC_NONE:
return MDBX_SUCCESS /* nothing to do */;
#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0
/* LY: This code is always safe and without appreciable performance
* degradation, even on a kernel with fdatasync's bug.
*
* For more info about the corresponding fdatasync() bug
* see http://www.spinics.net/lists/linux-ext4/msg33714.html */
if ((mode_bits & MDBX_SYNC_SIZE) == 0) {
case MDBX_SYNC_DATA:
if (fdatasync(fd) == 0)
return MDBX_SUCCESS;
} else
#else
(void)mode_bits;
#endif
if (fsync(fd) == 0)
return MDBX_SUCCESS;
rc = errno;
} while (rc == EINTR);
return rc;
break /* error */;
#if defined(__linux__) || defined(__gnu_linux__)
case MDBX_SYNC_SIZE:
if (mdbx_linux_kernel_version >= 0x03060000)
return MDBX_SUCCESS;
__fallthrough /* fall through */;
#endif /* Linux */
#endif /* _POSIX_SYNCHRONIZED_IO > 0 */
default:
if (fsync(fd) == 0)
return MDBX_SUCCESS;
}
int rc = errno;
if (rc != EINTR)
return rc;
}
#endif
}
@ -938,24 +944,24 @@ MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) {
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset,
size_t length, int async) {
size_t length,
enum mdbx_syncmode_bits mode_bits) {
uint8_t *ptr = (uint8_t *)map->address + offset;
#if defined(_WIN32) || defined(_WIN64)
if (FlushViewOfFile(ptr, length) && (async || FlushFileBuffers(map->fd)))
return MDBX_SUCCESS;
return GetLastError();
if (!FlushViewOfFile(ptr, length))
return GetLastError();
#else
#if defined(__linux__) || defined(__gnu_linux__)
if (async && mdbx_linux_kernel_version > 0x02061300)
/* Since Linux 2.6.19, MS_ASYNC is in fact a no-op,
since the kernel properly tracks dirty pages and flushes them to storage
as necessary. */
if (mode_bits == MDBX_SYNC_NONE && mdbx_linux_kernel_version > 0x02061300)
/* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly
* tracks dirty pages and flushes them to storage as necessary. */
return MDBX_SUCCESS;
#endif /* Linux */
const int mode = async ? MS_ASYNC : MS_SYNC;
int rc = (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno;
return rc;
if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC))
return errno;
mode_bits &= ~MDBX_SYNC_DATA;
#endif
return mdbx_fsync(map->fd, mode_bits);
}
MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle,

View File

@ -572,13 +572,14 @@ mdbx_thread_create(mdbx_thread_t *thread,
MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread);
/* Bit flags telling mdbx_fsync() and mdbx_msync() what has to be flushed.
 * Each non-zero enumerator is a distinct bit, so callers combine them with
 * bitwise OR (e.g. MDBX_SYNC_DATA | MDBX_SYNC_SIZE). */
enum mdbx_syncmode_bits {
/* No bits set: the POSIX path of mdbx_fsync() returns success with nothing
 * to do, and mdbx_msync() requests only an asynchronous (MS_ASYNC) flush. */
MDBX_SYNC_NONE = 0,
/* Flush file data: mdbx_msync() uses MS_SYNC when this bit is set, and
 * mdbx_fsync() uses fdatasync() when it is the only DATA/SIZE bit given
 * (where _POSIX_SYNCHRONIZED_IO is available). */
MDBX_SYNC_DATA = 1,
/* Presumably "file size/metadata must be durable too": mdbx_fsync() uses a
 * full fsync() when this bit is present; when it is the only bit, newer
 * Linux kernels skip the sync entirely (see the version check there). */
MDBX_SYNC_SIZE = 2,
/* Requests the strongest flush: on Apple targets mdbx_fsync() issues
 * fcntl(F_FULLFSYNC) under a build-time condition, and on Windows this bit
 * (like MDBX_SYNC_DATA) triggers FlushFileBuffers().
 * NOTE(review): name suggests "I/O device queue" - confirm. */
MDBX_SYNC_IODQ = 4
};
MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd,
const enum mdbx_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd,
const enum mdbx_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length);
@ -635,7 +636,8 @@ MDBX_INTERNAL_FUNC int
mdbx_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset,
size_t length, int async);
size_t length,
enum mdbx_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle,
const char *pathname, int err);

View File

@ -298,7 +298,6 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option,
const struct option_verb mode_bits[] = {
{"rdonly", unsigned(MDBX_RDONLY)},
{"mapasync", unsigned(MDBX_MAPASYNC)},
{"nosync-utterly", unsigned(MDBX_UTTERLY_NOSYNC)},
{"nosubdir", unsigned(MDBX_NOSUBDIR)},
{"nosync-safe", unsigned(MDBX_SAFE_NOSYNC)},

View File

@ -95,7 +95,6 @@ void __noreturn usage(void) {
" coalesce == MDBX_COALESCE\n"
" nosync-safe == MDBX_SAFE_NOSYNC\n"
" writemap == MDBX_WRITEMAP\n"
" mapasync == MDBX_MAPASYNC\n"
" nosync-utterly == MDBX_UTTERLY_NOSYNC\n"
" perturb == MDBX_PAGEPERTURB\n"
" notls == MDBX_NOTLS\n"
@ -125,8 +124,8 @@ void actor_params::set_defaults(const std::string &tmpdir) {
#endif
pathname_db = tmpdir + "mdbx-test.db";
mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NOMEMINIT |
MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_ACCEDE;
mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_SAFE_NOSYNC |
MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_ACCEDE;
table_flags = MDBX_DUPSORT;
size_lower = -1;

View File

@ -77,8 +77,7 @@ bool testcase_nested::teardown() {
void testcase_nested::push_txn() {
MDBX_txn *txn;
unsigned flags =
prng32() & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC);
unsigned flags = prng32() & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC);
int err = mdbx_txn_begin(db_guard.get(), txn_guard.get(), flags, &txn);
if (unlikely(err != MDBX_SUCCESS))
failure_perror("mdbx_txn_begin(nested)", err);

View File

@ -109,8 +109,8 @@ static void db_connect() {
env, 0, 0, REC_COUNT * sizeof(session_data_t) * 10, -1, -1, -1));
MDBX_CHECK(mdbx_env_set_maxdbs(env, 30));
MDBX_CHECK(mdbx_env_open(env, opt_db_path,
MDBX_CREATE | MDBX_WRITEMAP | MDBX_MAPASYNC |
MDBX_SAFE_NOSYNC | MDBX_LIFORECLAIM,
MDBX_CREATE | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC |
MDBX_LIFORECLAIM,
0664));
MDBX_txn *txn;