diff --git a/src/core.c b/src/core.c index dd2f2ffe..aed1f500 100644 --- a/src/core.c +++ b/src/core.c @@ -3972,7 +3972,7 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, const size_t bytes = pgno2bytes(env, npages); memset(mp, -1, bytes); mp->mp_pgno = pgno; - if ((env->me_flags & MDBX_WRITEMAP) == 0) + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); } else { struct iovec iov[MDBX_COMMIT_PAGES]; @@ -4430,6 +4430,9 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); dp->mp_txnid = txn->mt_txnid; tASSERT(txn, IS_SPILLED(txn, dp)); +#if MDBX_AVOID_MSYNC + doit:; +#endif /* MDBX_AVOID_MSYNC */ int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, pgno2bytes(env, npages)); if (unlikely(err != MDBX_SUCCESS)) { @@ -4452,6 +4455,9 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, } } else { tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); +#if MDBX_AVOID_MSYNC + goto doit; +#endif /* MDBX_AVOID_MSYNC */ } #if MDBX_NEED_WRITTEN_RANGE @@ -4466,17 +4472,18 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, } static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, - unsigned npages) { + const unsigned npages) { +#if !MDBX_AVOID_MSYNC tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - pgno_t pgno = dp->mp_pgno; - int err = iov_page(txn, ctx, dp, npages); - if (likely(err == MDBX_SUCCESS)) { - err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); +#endif /* MDBX_AVOID_MSYNC */ #if MDBX_ENABLE_PGOP_STAT - if (likely(err == MDBX_SUCCESS)) - txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; + txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - } + const pgno_t pgno = dp->mp_pgno; + int err = iov_page(txn, ctx, dp, npages); + if (likely(err == MDBX_SUCCESS) && + (!MDBX_AVOID_MSYNC || 
!(txn->mt_flags & MDBX_WRITEMAP))) + err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); return err; } @@ -4610,6 +4617,29 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1; #endif /* xMDBX_DEBUG_SPILLING */ + int rc = MDBX_SUCCESS; +#if !MDBX_AVOID_MSYNC + if (txn->mt_flags & MDBX_WRITEMAP) { + NOTICE("%s-spilling of %u dirty-entries (have %u dirty-room, need %u)", + "msync", wanna_spill, txn->tw.dirtyroom, need); + tASSERT(txn, txn->tw.spill_pages == nullptr); + const MDBX_env *env = txn->mt_env; + rc = + osal_msync(&txn->mt_env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + dpl_clear(txn->tw.dirtylist); + txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; + for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = lp->mp_next) { + rc = dpl_append(txn, lp->mp_pgno, lp, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + goto done; + } +#endif /* MDBX_AVOID_MSYNC */ + const unsigned dirty = txn->tw.dirtylist->length; const unsigned spill_min = txn->mt_env->me_options.spill_min_denominator @@ -4624,68 +4654,27 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (!wanna_spill) return MDBX_SUCCESS; - NOTICE("spilling %u dirty-entries (have %u dirty-room, need %u)", wanna_spill, - txn->tw.dirtyroom, need); + NOTICE("%s-spilling %u dirty-entries (have %u dirty-room, need %u)", "pwrite", + wanna_spill, txn->tw.dirtyroom, need); tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); - - int rc = MDBX_SUCCESS; - if (txn->mt_flags & MDBX_WRITEMAP) { - MDBX_dpl *const dl = txn->tw.dirtylist; - const unsigned span = dl->length - txn->tw.loose_count; - txn->tw.dirtyroom += span; - - iov_ctx_t ctx; - rc = iov_init(txn, &ctx, wanna_spill, - dl->pages_including_loose - txn->tw.loose_count); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - 
unsigned r, w; - for (w = 0, r = 1; r <= dl->length; ++r) { - MDBX_page *dp = dl->items[r].ptr; - if (dp->mp_flags & P_LOOSE) - dl->items[++w] = dl->items[r]; - else if (!MDBX_FAKE_SPILL_WRITEMAP) { - rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r)); - tASSERT(txn, rc == MDBX_SUCCESS); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + if (!txn->tw.spill_pages) { + txn->tw.spill_least_removed = INT_MAX; + txn->tw.spill_pages = pnl_alloc(wanna_spill); + if (unlikely(!txn->tw.spill_pages)) { + rc = MDBX_ENOMEM; + bailout: + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; } + } else { + /* purge deleted slots */ + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill); + (void)rc /* ignore since the resulting list may be shorter + and pnl_append() will increase pnl on demand */ + ; } - - tASSERT(txn, span == r - 1 - w && w == txn->tw.loose_count); - dl->sorted = (dl->sorted == dl->length) ? w : 0; - dpl_setlen(dl, w); - tASSERT(txn, dirtylist_check(txn)); - - if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) { - MDBX_env *const env = txn->mt_env; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, - pgno_align2os_bytes(env, ctx.flush_begin), - pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin), - MDBX_SYNC_NONE); - } - return rc; - } - - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - if (!txn->tw.spill_pages) { - txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = pnl_alloc(wanna_spill); - if (unlikely(!txn->tw.spill_pages)) { - rc = MDBX_ENOMEM; - bailout: - txn->mt_flags |= MDBX_TXN_ERROR; - return rc; - } - } else { - /* purge deleted slots */ - spill_purge(txn); - rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill); - (void)rc /* ignore since the resulting list may be shorter - and pnl_append() will increase pnl on demand */ - ; } /* Сортируем чтобы запись на диск была полее последовательна */ @@ -4848,8 
+4837,10 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); - txn->mt_flags |= MDBX_TXN_SPILLS; + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); + txn->mt_flags |= MDBX_TXN_SPILLS; + } NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled_entries, txn->tw.dirtyroom); } else { @@ -5783,9 +5774,13 @@ static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, if (env->me_flags & MDBX_WRITEMAP) { unaligned_poke_u64(4, meta->mm_sign, wipe); osal_flush_incoherent_cpu_writeback(); - err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA); - if (unlikely(err != MDBX_SUCCESS)) + if (!MDBX_AVOID_MSYNC) { + err = + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ return err; } ptr = data_page(meta); @@ -7082,18 +7077,26 @@ static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { int rc = MDBX_RESULT_TRUE; if (env->me_flags & MDBX_WRITEMAP) { -#if MDBX_ENABLE_PGOP_ST - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const MDBX_page *page = data_page(head.ptr_c); - rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - (uint8_t *)page - env->me_map); - - if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (!MDBX_AVOID_MSYNC) { + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ + } else { +#if 
MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(head.ptr_c); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + (uint8_t *)page - env->me_map); + + if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } } } else { rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); @@ -9948,8 +9951,9 @@ bailout: } static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { - MDBX_dpl *const dl = - (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn); + MDBX_dpl *dl = txn->tw.dirtylist; + if (MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) + dl = dpl_sort(txn); int rc = MDBX_SUCCESS; unsigned r, w; for (w = 0, r = 1; r <= dl->length; ++r) { @@ -11273,15 +11277,19 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; } + if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.msync.weak += sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (flags & MDBX_WRITEMAP) rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); - else + } else { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += sync_op; +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, mode_bits); + } if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = (flags & MDBX_SAFE_NOSYNC) ? 
MDBX_RESULT_TRUE /* carry non-steady */ @@ -11399,14 +11407,33 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, memcpy(target->mm_sign, pending->mm_sign, 8); osal_flush_incoherent_cpu_writeback(); jitter4testing(true); - /* sync meta-pages */ + if (!MDBX_AVOID_MSYNC) { + /* sync meta-pages */ #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) - ? MDBX_SYNC_NONE - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) + ? MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(target); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + (uint8_t *)page - env->me_map); + if (likely(rc == MDBX_SUCCESS)) { + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); + if ((flags & MDBX_NOMETASYNC) == 0 && + env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + } + } if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { @@ -13347,8 +13374,16 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, uint8_t ior_flags = 0; if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) { ior_flags = IOR_OVERLAPPED; + if ((flags & MDBX_WRITEMAP) && MDBX_AVOID_MSYNC) { + MDBX_meta header; + if (read_header(env, &header, MDBX_SUCCESS, true) == MDBX_SUCCESS && + header.mm_psize >= env->me_os_psize) + ior_flags |= IOR_DIRECT; + } + rc = - osal_openfile(MDBX_OPEN_DXB_OVERLAPPED, + osal_openfile((ior_flags & 
IOR_DIRECT) ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT + : MDBX_OPEN_DXB_OVERLAPPED, env, env_pathname.dxb, &env->me_overlapped_fd, 0); if (rc != MDBX_SUCCESS) goto bailout; @@ -23481,6 +23516,7 @@ __dll_export " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG + " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) #if MDBX_DISABLE_VALIDATION diff --git a/src/options.h b/src/options.h index 08018630..1a28e619 100644 --- a/src/options.h +++ b/src/options.h @@ -121,23 +121,22 @@ #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Basically, this build-option is for TODO. Guess it should be replaced - * with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants: - * 0/OFF = Don't track dirty pages at all and don't spilling ones. - * This should be by-default on Linux and may-be other systems - * (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides - * properly LRU tracking and async writing on-demand. - * 1/ON = Lite tracking of dirty pages but with LRU labels and explicit - * spilling with msync(MS_ASYNC). */ -#ifndef MDBX_FAKE_SPILL_WRITEMAP -#if defined(__linux__) || defined(__gnu_linux__) -#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */ +/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP + * mode. 0/OFF = Don't track dirty pages at all, don't spill them, and use + * msync() to persist data. This is the default on Linux and other systems + * where the kernel provides proper LRU tracking and effective flushing on + * demand. 1/ON = Track dirty pages with LRU labels for spilling and + * explicitly persist them by write(). This may be reasonable for systems + * with low performance of msync() and/or LRU tracking. 
*/ +#ifndef MDBX_AVOID_MSYNC +#if defined(_WIN32) || defined(_WIN64) +#define MDBX_AVOID_MSYNC 1 #else -#define MDBX_FAKE_SPILL_WRITEMAP 0 +#define MDBX_AVOID_MSYNC 0 #endif -#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1) -#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1 -#endif /* MDBX_FAKE_SPILL_WRITEMAP */ +#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1) +#error MDBX_AVOID_MSYNC must be defined as 0 or 1 +#endif /* MDBX_AVOID_MSYNC */ /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. diff --git a/src/osal.c b/src/osal.c index 77b6adfc..2e0bb56a 100644 --- a/src/osal.c +++ b/src/osal.c @@ -637,7 +637,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, #if defined(_WIN32) || defined(_WIN64) const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2); const bool use_gather = - (ior->flags & IOR_UNBUFFERED) && ior->slots_left >= segments; + (ior->flags & IOR_DIRECT) && ior->slots_left >= segments; #endif /* Windows */ ior_item_t *item = ior->pool; @@ -1179,6 +1179,10 @@ MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, case MDBX_OPEN_DXB_LAZY: DesiredAccess |= GENERIC_READ | GENERIC_WRITE; break; + case MDBX_OPEN_DXB_OVERLAPPED_DIRECT: + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; + /* fall through */ + __fallthrough; case MDBX_OPEN_DXB_OVERLAPPED: FlagsAndAttributes |= FILE_FLAG_OVERLAPPED; /* fall through */ diff --git a/src/osal.h b/src/osal.h index 11ef24f8..568ae9c0 100644 --- a/src/osal.h +++ b/src/osal.h @@ -318,7 +318,7 @@ typedef struct osal_ioring { unsigned slots_left; unsigned allocated; #if defined(_WIN32) || defined(_WIN64) -#define IOR_UNBUFFERED 1 +#define IOR_DIRECT 1 #define IOR_OVERLAPPED 2 #define IOR_STATE_LOCKED 1 unsigned pagesize; @@ -501,6 +501,7 @@ enum osal_openfile_purpose { MDBX_OPEN_DXB_DSYNC, #if defined(_WIN32) || defined(_WIN64) 
MDBX_OPEN_DXB_OVERLAPPED, + MDBX_OPEN_DXB_OVERLAPPED_DIRECT, #endif /* Windows */ MDBX_OPEN_LCK, MDBX_OPEN_COPY,