From 23d236f70e690b21065fe2119caad0b71d9adf4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?=
 =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= <leo@yuriev.ru>
Date: Sat, 3 Dec 2022 14:55:38 +0300
Subject: [PATCH] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B5=20`MDBX=5Fopt=5Fwritethrough=5Fthresho?=
 =?UTF-8?q?ld`=20=D0=B8=20=D1=81=D0=BE=D0=BF=D1=83=D1=82=D1=81=D1=82=D0=B2?=
 =?UTF-8?q?=D1=83=D1=8E=D1=89=D0=B8=D0=B5=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1?=
 =?UTF-8?q?=D0=BE=D1=82=D0=BA=D0=B8.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mdbx.h            |  28 ++++++
 src/core.c        | 218 +++++++++++++++++++++++++++++++++-------------
 src/internals.h   |  11 ++-
 src/lck-windows.c |  36 +++++---
 src/osal.c        |  62 +++++++------
 src/osal.h        |  24 ++---
 6 files changed, 266 insertions(+), 113 deletions(-)

diff --git a/mdbx.h b/mdbx.h
index b4d25f74..3f58ccb0 100644
--- a/mdbx.h
+++ b/mdbx.h
@@ -2220,6 +2220,34 @@ enum MDBX_option_t {
    * to 50% (half empty) which corresponds to the range from 8192 and to 32768
    * in units respectively. */
   MDBX_opt_merge_threshold_16dot16_percent,
+
+  /** \brief Controls the choosing between use write-through disk writes and
+   * usual ones with followed flush by the `fdatasync()` syscall.
+   * \details Depending on the operating system, storage subsystem
+   * characteristics and the use case, higher performance can be achieved by
+   * either using write-through or a series of usual/lazy writes followed by
+   * the flush-to-disk.
+   *
+   * Basically for N chunks the latency/cost of write-through is:
+   *  latency = N * (emit + round-trip-to-storage + storage-execution);
+   * And for a series of lazy writes with flush it is:
+   *  latency = N * (emit + storage-execution) + flush + round-trip-to-storage.
+   *
+   * So, for large N and/or noticeable round-trip-to-storage the write+flush
+   * approach wins. But for small N and/or near-zero NVMe-like latency
+   * the write-through is better.
+   *
+   * To solve this issue libmdbx provides `MDBX_opt_writethrough_threshold`:
+   *  - when N described above is less than or equal to the specified
+   *    threshold, a write-through approach will be used;
+   *  - otherwise, when N is greater than the specified threshold,
+   *    a write-and-flush approach will be used.
+   *
+   * \note MDBX_opt_writethrough_threshold affects only \ref MDBX_SYNC_DURABLE
+   * mode without \ref MDBX_WRITEMAP, and not supported on Windows.
+   * On Windows write-through is always used, but \ref MDBX_NOMETASYNC could
+   * be used for switching to write-and-flush. */
+  MDBX_opt_writethrough_threshold,
 };
 #ifndef __cplusplus
 /** \ingroup c_settings */
diff --git a/src/core.c b/src/core.c
index e5953d96..a01c18cd 100644
--- a/src/core.c
+++ b/src/core.c
@@ -4446,6 +4446,7 @@ static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) {
 typedef struct iov_ctx {
   MDBX_env *env;
   osal_ioring_t *ior;
+  mdbx_filehandle_t fd;
   int err;
 #ifndef MDBX_NEED_WRITTEN_RANGE
 #define MDBX_NEED_WRITTEN_RANGE 1
@@ -4458,10 +4459,12 @@ typedef struct iov_ctx {
 } iov_ctx_t;
 
 __must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx,
-                                        size_t items, size_t npages) {
+                                        size_t items, size_t npages,
+                                        mdbx_filehandle_t fd) {
   ctx->env = txn->mt_env;
   ctx->ior = &txn->mt_env->me_ioring;
-  ctx->err = osal_ioring_reserve(ctx->ior, items,
+  ctx->fd = fd;
+  ctx->err = osal_ioring_prepare(ctx->ior, items,
                                  pgno_align2os_bytes(txn->mt_env, npages));
   if (likely(ctx->err == MDBX_SUCCESS)) {
 #if MDBX_NEED_WRITTEN_RANGE
@@ -4534,12 +4537,10 @@ static void iov_complete(iov_ctx_t *ctx) {
 
 __must_check_result static int iov_write(iov_ctx_t *ctx) {
   eASSERT(ctx->env, !iov_empty(ctx));
-  osal_ioring_write_result_t r = osal_ioring_write(ctx->ior);
+  osal_ioring_write_result_t r = osal_ioring_write(ctx->ior, ctx->fd);
 #if MDBX_ENABLE_PGOP_STAT
   ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops;
 #endif /* MDBX_ENABLE_PGOP_STAT */
-  if (!ctx->env->me_lck->mti_eoos_timestamp.weak)
-    ctx->env->me_lck->mti_eoos_timestamp.weak = osal_monotime();
   ctx->err = r.err;
   if (unlikely(ctx->err != MDBX_SUCCESS))
     ERROR("Write error: %s", mdbx_strerror(ctx->err));
@@ -4596,7 +4597,6 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx,
                        ? ctx->flush_end
                        : dp->mp_pgno + (pgno_t)npages;
 #endif /* MDBX_NEED_WRITTEN_RANGE */
-  env->me_lck->mti_unsynced_pages.weak += npages;
   return MDBX_SUCCESS;
 }
 
@@ -4816,6 +4816,8 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
                    pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK);
     if (unlikely(rc != MDBX_SUCCESS))
       goto bailout;
+    env->me_lck->mti_unsynced_pages.weak +=
+        txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count;
     dpl_clear(txn->tw.dirtylist);
     txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count;
     for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = mp_next(lp)) {
@@ -4950,7 +4952,12 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
     tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);
 
     iov_ctx_t ctx;
-    rc = iov_init(txn, &ctx, amount_entries, amount_npages);
+    rc =
+        iov_init(txn, &ctx, amount_entries, amount_npages,
+#if defined(_WIN32) || defined(_WIN64)
+                 txn->mt_env->me_overlapped_fd ? txn->mt_env->me_overlapped_fd :
+#endif
+                                               txn->mt_env->me_lazy_fd);
     if (unlikely(rc != MDBX_SUCCESS))
       goto bailout;
 
@@ -5028,6 +5035,7 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
     if (unlikely(rc != MDBX_SUCCESS))
       goto bailout;
 
+    txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages;
     if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) {
       pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1);
       txn->mt_flags |= MDBX_TXN_SPILLS;
@@ -10543,7 +10551,7 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
   MDBX_dpl *const dl = dpl_sort(txn);
   int rc = MDBX_SUCCESS;
-  size_t r, w;
+  size_t r, w, total_npages = 0;
   for (w = 0, r = 1; r <= dl->length; ++r) {
     MDBX_page *dp = dl->items[r].ptr;
     if (dp->mp_flags & P_LOOSE) {
@@ -10551,9 +10559,10 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
       continue;
     }
     unsigned npages = dpl_npages(dl, r);
+    total_npages += npages;
     rc = iov_page(txn, ctx, dp, npages);
     if (unlikely(rc != MDBX_SUCCESS))
-      break;
+      return rc;
   }
 
   if (!iov_empty(ctx)) {
@@ -10561,6 +10570,13 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
     rc = iov_write(ctx);
   }
 
+  if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->mt_env->me_lazy_fd) {
+    txn->mt_env->me_lck->mti_unsynced_pages.weak += total_npages;
+    if (!txn->mt_env->me_lck->mti_eoos_timestamp.weak)
+      txn->mt_env->me_lck->mti_eoos_timestamp.weak = osal_monotime();
+  }
+
+  txn->tw.dirtylist->pages_including_loose -= total_npages;
   while (r <= dl->length)
     dl->items[++w] = dl->items[r++];
 
@@ -10569,6 +10585,8 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
   tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                    (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                    : txn->mt_env->me_options.dp_limit));
+  tASSERT(txn, txn->tw.dirtylist->length == txn->tw.loose_count);
+  tASSERT(txn, txn->tw.dirtylist->pages_including_loose == txn->tw.loose_count);
   return rc;
 }
 
@@ -11235,6 +11253,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
   if (unlikely(rc != MDBX_SUCCESS))
     goto fail;
 
+  tASSERT(txn, txn->tw.loose_count == 0);
   txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)
                                            ? txn->mt_txnid
                                            : txn->mt_dbs[FREE_DBI].md_mod_txnid;
@@ -11252,40 +11271,74 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
       goto fail;
   }
 
+  bool need_flush_for_nometasync = false;
   const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
+  const uint32_t meta_sync_txnid =
+      atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed);
   /* sync prev meta */
-  if (head.is_steady && atomic_load32(&env->me_lck->mti_meta_sync_txnid,
-                                      mo_Relaxed) != (uint32_t)head.txnid) {
-    /* FIXME: Тут есть унаследованный от LMDB недочет.
+  if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) {
+    /* Исправление унаследованного от LMDB недочета:
      *
-     * Проблем нет, если все процессы работающие с БД не используют WRITEMAP.
+     * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP.
      * Тогда мета-страница (обновленная, но не сброшенная на диск) будет
      * сохранена в результате fdatasync() при записи данных этой транзакции.
      *
-     * Проблем нет, если все процессы работающие с БД используют WRITEMAP
+     * Всё хорошо, если все процессы работающие с БД используют WRITEMAP
      * без MDBX_AVOID_MSYNC.
      * Тогда мета-страница (обновленная, но не сброшенная на диск) будет
      * сохранена в результате msync() при записи данных этой транзакции.
      *
-     * Если же происходит комбинирование WRITEMAP и записи через файловый
-     * дескриптор, то требуется явно обновлять мета-страницу. Однако,
-     * так полностью теряется выгода от NOMETASYNC.
-     *
-     * Дефект же в том, что сейчас нет возможности отличить последний случай от
-     * двух предыдущих и поэтому приходится всегда задействовать meta_sync(). */
-    rc = meta_sync(env, head);
-    if (unlikely(rc != MDBX_SUCCESS)) {
-      ERROR("txn-%s: error %d", "presync-meta", rc);
-      goto fail;
+     * Если же в процессах работающих с БД используется оба метода, как sync()
+     * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то
+     * становится невозможным обеспечить фиксацию на диске мета-страницы
+     * предыдущей транзакции и данных текущей транзакции, за счет одной
+     * sync-операцией выполняемой после записи данных текущей транзакции.
+     * Соответственно, требуется явно обновлять мета-страницу, что полностью
+     * уничтожает выгоду от NOMETASYNC. */
+    const uint32_t txnid_dist =
+        ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC)
+            ? MDBX_NOMETASYNC_LAZY_FD
+            : MDBX_NOMETASYNC_LAZY_WRITEMAP;
+    /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync()
+     * или msync() для гарантированной фиксации на диске мета-страницы,
+     * которая была "лениво" отправлена на запись в предыдущей транзакции,
+     * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */
+    if (
+#if defined(_WIN32) || defined(_WIN64)
+        !env->me_overlapped_fd &&
+#endif
+        meta_sync_txnid == (uint32_t)head.txnid - txnid_dist)
+      need_flush_for_nometasync = true;
+    else {
+      rc = meta_sync(env, head);
+      if (unlikely(rc != MDBX_SUCCESS)) {
+        ERROR("txn-%s: error %d", "presync-meta", rc);
+        goto fail;
+      }
     }
   }
 
   if (txn->tw.dirtylist) {
     tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
+    tASSERT(txn, txn->tw.loose_count == 0);
+
+    mdbx_filehandle_t fd =
+#if defined(_WIN32) || defined(_WIN64)
+        env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
+    (void)need_flush_for_nometasync;
+#else
+#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2
+        (need_flush_for_nometasync ||
+         env->me_dsync_fd == INVALID_HANDLE_VALUE ||
+         txn->tw.dirtylist->length > env->me_options.writethrough_threshold ||
+         atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed))
+            ? env->me_lazy_fd
+            : env->me_dsync_fd;
+#endif /* Windows */
+
     iov_ctx_t write_ctx;
     rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length,
-                  txn->tw.dirtylist->pages_including_loose -
-                      txn->tw.loose_count);
+                  txn->tw.dirtylist->pages_including_loose, fd);
     if (unlikely(rc != MDBX_SUCCESS)) {
       ERROR("txn-%s: error %d", "iov-init", rc);
       goto fail;
@@ -11298,6 +11351,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
     }
   } else {
     tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
+    env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages;
+    if (!env->me_lck->mti_eoos_timestamp.weak)
+      env->me_lck->mti_eoos_timestamp.weak = osal_monotime();
   }
 
   /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
@@ -12020,6 +12076,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
     atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed);
   } else {
     assert(rc == MDBX_RESULT_TRUE /* carry non-steady */);
+    eASSERT(env, env->me_lck->mti_unsynced_pages.weak > 0);
+    eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0);
     unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK);
   }
 
@@ -12188,9 +12246,15 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
     if (unlikely(rc != MDBX_RESULT_TRUE))
       goto fail;
   }
+
+  const uint32_t sync_txnid_dist =
+      ((flags & MDBX_NOMETASYNC) == 0) ? 0
+      : ((flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC)
+          ? MDBX_NOMETASYNC_LAZY_FD
+          : MDBX_NOMETASYNC_LAZY_WRITEMAP;
   env->me_lck->mti_meta_sync_txnid.weak =
       pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak -
-      ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0);
+      sync_txnid_dist;
 
   *troika = meta_tap(env);
   for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child)
@@ -12349,11 +12413,8 @@ __cold int mdbx_env_create(MDBX_env **penv) {
 
   env->me_maxreaders = DEFAULT_READERS;
   env->me_maxdbs = env->me_numdbs = CORE_DBS;
-  env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_fd4data =
-#if defined(_WIN32) || defined(_WIN64)
-      env->me_overlapped_fd =
-#endif /* Windows */
-          env->me_lfd = INVALID_HANDLE_VALUE;
+  env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_lfd =
+      INVALID_HANDLE_VALUE;
   env->me_pid = osal_getpid();
   env->me_stuck_meta = -1;
 
@@ -12371,6 +12432,14 @@ __cold int mdbx_env_create(MDBX_env **penv) {
   env->me_options.dp_loose_limit = 64;
   env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */;
 
+#if !(defined(_WIN32) || defined(_WIN64))
+  env->me_options.writethrough_threshold =
+#if defined(__linux__) || defined(__gnu_linux__)
+      mdbx_RunningOnWSL1 ? MAX_PAGENO :
+#endif /* Linux */
+                         MDBX_WRITETHROUGH_THRESHOLD_DEFAULT;
+#endif /* Windows */
+
   env->me_os_psize = (unsigned)os_psize;
   setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize
                                                         : MAX_PAGESIZE);
@@ -14184,12 +14253,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
   const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000);
   osal_fseek(env->me_lazy_fd, safe_parking_lot_offset);
 
-  env->me_fd4data = env->me_fd4meta = env->me_lazy_fd;
+  env->me_fd4meta = env->me_lazy_fd;
 #if defined(_WIN32) || defined(_WIN64)
-  uint8_t ior_flags = 0;
-  if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == MDBX_SYNC_DURABLE) {
-    ior_flags = IOR_OVERLAPPED;
-    if ((flags & MDBX_WRITEMAP) && MDBX_AVOID_MSYNC) {
+  eASSERT(env, env->me_overlapped_fd == 0);
+  bool ior_direct = false;
+  if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC))) {
+    if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) {
       /* Запрошен режим MDBX_SAFE_NOSYNC | MDBX_WRITEMAP при активной опции
        * MDBX_AVOID_MSYNC.
        *
@@ -14203,23 +14272,30 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
        * 2) Кроме этого, в Windows запись в заблокированный регион файла
        * возможно только через тот-же дескриптор. Поэтому изначальный захват
        * блокировок посредством osal_lck_seize(), захват/освобождение блокировок
-       * во время пишущих транзакций и запись данных должны выполнять через один
-       * дескриптор.
+       * во время пишущих транзакций и запись данных должны выполнятся через
+       * один дескриптор.
        *
        * Таким образом, требуется прочитать волатильный заголовок БД, чтобы
        * узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном
        * для записи данных, чтобы использовать именно этот дескриптор для
        * изначального захвата блокировок. */
       MDBX_meta header;
-      if (read_header(env, &header, MDBX_SUCCESS, true) == MDBX_SUCCESS &&
-          header.mm_psize >= env->me_os_psize)
-        ior_flags |= IOR_DIRECT;
+      uint64_t dxb_filesize;
+      int err = read_header(env, &header, MDBX_SUCCESS, true);
+      if ((err == MDBX_SUCCESS && header.mm_psize >= env->me_os_psize) ||
+          (err == MDBX_ENODATA && mode && env->me_psize >= env->me_os_psize &&
+           osal_filesize(env->me_lazy_fd, &dxb_filesize) == MDBX_SUCCESS &&
+           dxb_filesize == 0))
+        /* Может быть коллизия, если два процесса пытаются одновременно создать
+         * БД с разным размером страницы, который у одного меньше системной
+         * страницы, а у другого НЕ меньше. Эта допустимая, но очень странная
+         * ситуация. Поэтому считаем её ошибочной и не пытаемся разрешить. */
+        ior_direct = true;
     }
 
-    rc =
-        osal_openfile((ior_flags & IOR_DIRECT) ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT
-                                               : MDBX_OPEN_DXB_OVERLAPPED,
-                      env, env_pathname.dxb, &env->me_overlapped_fd, 0);
+    rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT
+                                  : MDBX_OPEN_DXB_OVERLAPPED,
+                       env, env_pathname.dxb, &env->me_overlapped_fd, 0);
     if (rc != MDBX_SUCCESS)
       goto bailout;
     env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr);
@@ -14227,7 +14303,6 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
       rc = (int)GetLastError();
       goto bailout;
     }
-    env->me_fd4data = env->me_overlapped_fd;
     osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset);
   }
 #else
@@ -14260,17 +14335,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
                                       MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD;
 
   eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
-  if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC)) == 0 &&
-      (env->me_fd4data == env->me_lazy_fd || !(flags & MDBX_NOMETASYNC))) {
+  if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC))) {
     rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb,
                        &env->me_dsync_fd, 0);
     if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
       if ((flags & MDBX_NOMETASYNC) == 0)
         env->me_fd4meta = env->me_dsync_fd;
-#if defined(_WIN32) || defined(_WIN64)
-      if (env->me_fd4data == env->me_lazy_fd)
-        env->me_fd4data = env->me_dsync_fd;
-#endif /* Windows must die */
       osal_fseek(env->me_dsync_fd, safe_parking_lot_offset);
     }
   }
@@ -14386,11 +14456,12 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
         rc = MDBX_ENOMEM;
     }
     if (rc == MDBX_SUCCESS)
-      rc = osal_ioring_create(&env->me_ioring,
+      rc = osal_ioring_create(&env->me_ioring
 #if defined(_WIN32) || defined(_WIN64)
-                              ior_flags,
+                              ,
+                              ior_direct, env->me_overlapped_fd
 #endif /* Windows */
-                              env->me_fd4data);
+      );
   }
 
 #if MDBX_DEBUG
@@ -14462,10 +14533,13 @@ __cold static int env_close(MDBX_env *env) {
   }
 
 #if defined(_WIN32) || defined(_WIN64)
-  if (env->me_overlapped_fd != INVALID_HANDLE_VALUE) {
-    CloseHandle(env->me_data_lock_event);
+  if (env->me_overlapped_fd) {
     CloseHandle(env->me_overlapped_fd);
-    env->me_overlapped_fd = INVALID_HANDLE_VALUE;
+    env->me_overlapped_fd = 0;
+  }
+  if (env->me_data_lock_event != INVALID_HANDLE_VALUE) {
+    CloseHandle(env->me_data_lock_event);
+    env->me_data_lock_event = INVALID_HANDLE_VALUE;
   }
 #endif /* Windows */
 
@@ -24054,6 +24128,24 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
     recalculate_merge_threshold(env);
     break;
 
+  case MDBX_opt_writethrough_threshold:
+    if (value != (unsigned)value)
+      err = MDBX_EINVAL;
+    else
+#if defined(_WIN32) || defined(_WIN64)
+      /* позволяем "установить" значение по-умолчанию и совпадающее
+       * с поведением соответствующим текущей установке MDBX_NOMETASYNC */
+      if ((unsigned)-1 != (unsigned)value &&
+          value != ((env->me_flags & MDBX_NOMETASYNC) ? 0 : INT_MAX))
+        err = MDBX_EINVAL;
+#else
+      env->me_options.writethrough_threshold =
+          ((unsigned)-1 == (unsigned)value)
+              ? MDBX_WRITETHROUGH_THRESHOLD_DEFAULT
+              : (unsigned)value;
+#endif
+
+    break;
   default:
     return MDBX_EINVAL;
   }
@@ -24127,6 +24219,14 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
     *pvalue = env->me_options.merge_threshold_16dot16_percent;
     break;
 
+  case MDBX_opt_writethrough_threshold:
+#if defined(_WIN32) || defined(_WIN64)
+    *pvalue = (env->me_flags & MDBX_NOMETASYNC) ? 0 : INT_MAX;
+#else
+    *pvalue = env->me_options.writethrough_threshold;
+#endif
+    break;
+
   default:
     return MDBX_EINVAL;
   }
diff --git a/src/internals.h b/src/internals.h
index 64d9a779..d504684c 100644
--- a/src/internals.h
+++ b/src/internals.h
@@ -761,6 +761,10 @@ typedef struct MDBX_lockinfo {
 
   /* Low 32-bit of txnid with which meta-pages was synced,
    * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
+#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3)
+#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8)
+#define MDBX_NOMETASYNC_LAZY_WRITEMAP                                          \
+  (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8)
   MDBX_atomic_uint32_t mti_meta_sync_txnid;
 
   /* Period for timed auto-sync feature, i.e. at the every steady checkpoint
@@ -1213,10 +1217,10 @@ struct MDBX_env {
   osal_mmap_t me_dxb_mmap; /* The main data file */
 #define me_map me_dxb_mmap.base
 #define me_lazy_fd me_dxb_mmap.fd
-#define me_fd4data me_ioring.fd
   mdbx_filehandle_t me_dsync_fd, me_fd4meta;
 #if defined(_WIN32) || defined(_WIN64)
-  HANDLE me_overlapped_fd, me_data_lock_event;
+#define me_overlapped_fd me_ioring.overlapped_fd
+  HANDLE me_data_lock_event;
 #endif                     /* Windows */
   osal_mmap_t me_lck_mmap; /* The lock file */
 #define me_lfd me_lck_mmap.fd
@@ -1259,6 +1263,9 @@ struct MDBX_env {
     uint8_t spill_min_denominator;
     uint8_t spill_parent4child_denominator;
     unsigned merge_threshold_16dot16_percent;
+#if !(defined(_WIN32) || defined(_WIN64))
+    unsigned writethrough_threshold;
+#endif /* Windows */
     union {
       unsigned all;
       /* tracks options with non-auto values but tuned by user */
diff --git a/src/lck-windows.c b/src/lck-windows.c
index 3bbe3f3b..e6ae78d2 100644
--- a/src/lck-windows.c
+++ b/src/lck-windows.c
@@ -152,8 +152,10 @@ static __inline int flock(HANDLE fd, unsigned flags, size_t offset,
 
 static __inline int flock_data(const MDBX_env *env, unsigned flags,
                                size_t offset, size_t bytes) {
-  return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags,
-                          offset, bytes);
+  const HANDLE fd4data =
+      env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
+  return flock_with_event(fd4data, env->me_data_lock_event, flags, offset,
+                          bytes);
 }
 
 static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) {
@@ -195,17 +197,19 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
   if (env->me_flags & MDBX_EXCLUSIVE)
     return MDBX_SUCCESS;
 
-  int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event,
+  const HANDLE fd4data =
+      env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
+  int rc = flock_with_event(fd4data, env->me_data_lock_event,
                             dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
                                      : (LCK_EXCLUSIVE | LCK_WAITFOR),
                             DXB_BODY);
   if (rc == ERROR_LOCK_VIOLATION && dontwait) {
     SleepEx(0, true);
-    rc = flock_with_event(env->me_fd4data, env->me_data_lock_event,
+    rc = flock_with_event(fd4data, env->me_data_lock_event,
                           LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY);
     if (rc == ERROR_LOCK_VIOLATION) {
       SleepEx(0, true);
-      rc = flock_with_event(env->me_fd4data, env->me_data_lock_event,
+      rc = flock_with_event(fd4data, env->me_data_lock_event,
                             LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY);
     }
   }
@@ -218,7 +222,9 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
 
 void mdbx_txn_unlock(MDBX_env *env) {
   if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
-    int err = funlock(env->me_fd4data, DXB_BODY);
+    const HANDLE fd4data =
+        env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
+    int err = funlock(fd4data, DXB_BODY);
     if (err != MDBX_SUCCESS)
       mdbx_panic("%s failed: err %u", __func__, err);
   }
@@ -451,18 +457,20 @@ static void lck_unlock(MDBX_env *env) {
     SetLastError(ERROR_SUCCESS);
   }
 
-  if (env->me_fd4data != INVALID_HANDLE_VALUE) {
+  const HANDLE fd4data =
+      env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
+  if (fd4data != INVALID_HANDLE_VALUE) {
     /* explicitly unlock to avoid latency for other processes (windows kernel
      * releases such locks via deferred queues) */
     do
-      err = funlock(env->me_fd4data, DXB_BODY);
+      err = funlock(fd4data, DXB_BODY);
     while (err == MDBX_SUCCESS);
     assert(err == ERROR_NOT_LOCKED ||
            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
     SetLastError(ERROR_SUCCESS);
 
     do
-      err = funlock(env->me_fd4data, DXB_WHOLE);
+      err = funlock(fd4data, DXB_WHOLE);
     while (err == MDBX_SUCCESS);
     assert(err == ERROR_NOT_LOCKED ||
            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
@@ -522,7 +530,9 @@ static int internal_seize_lck(HANDLE lfd) {
 }
 
 MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
-  assert(env->me_fd4data != INVALID_HANDLE_VALUE);
+  const HANDLE fd4data =
+      env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
+  assert(fd4data != INVALID_HANDLE_VALUE);
   if (env->me_flags & MDBX_EXCLUSIVE)
     return MDBX_RESULT_TRUE /* nope since files were must be opened
                                non-shareable */
@@ -554,7 +564,7 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
       return err;
     }
     jitter4testing(false);
-    err = funlock(env->me_fd4data, DXB_WHOLE);
+    err = funlock(fd4data, DXB_WHOLE);
     if (err != MDBX_SUCCESS)
       mdbx_panic("%s(%s) failed: err %u", __func__,
                  "unlock-against-without-lck", err);
@@ -564,8 +574,10 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
 }
 
 MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
+  const HANDLE fd4data =
+      env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd;
   /* Transite from exclusive-write state (E-E) to used (S-?) */
-  assert(env->me_fd4data != INVALID_HANDLE_VALUE);
+  assert(fd4data != INVALID_HANDLE_VALUE);
   assert(env->me_lfd != INVALID_HANDLE_VALUE);
 
   if (env->me_flags & MDBX_EXCLUSIVE)
diff --git a/src/osal.c b/src/osal.c
index b1e6ded4..83e7ca57 100644
--- a/src/osal.c
+++ b/src/osal.c
@@ -606,16 +606,18 @@ static size_t osal_iov_max;
 #undef OSAL_IOV_MAX
 #endif /* OSAL_IOV_MAX */
 
-MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior,
+MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior
 #if defined(_WIN32) || defined(_WIN64)
-                                          uint8_t flags,
+                                          ,
+                                          bool enable_direct,
+                                          mdbx_filehandle_t overlapped_fd
 #endif /* Windows */
-                                          mdbx_filehandle_t fd) {
+) {
   memset(ior, 0, sizeof(osal_ioring_t));
-  ior->fd = fd;
 
 #if defined(_WIN32) || defined(_WIN64)
-  ior->flags = flags;
+  ior->overlapped_fd = overlapped_fd;
+  ior->direct = enable_direct && overlapped_fd;
   const unsigned pagesize = (unsigned)osal_syspagesize();
   ior->pagesize = pagesize;
   ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize);
@@ -664,7 +666,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset,
 #if defined(_WIN32) || defined(_WIN64)
   const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2);
   const bool use_gather =
-      (ior->flags & IOR_DIRECT) && ior->slots_left >= segments;
+      ior->direct && ior->overlapped_fd && ior->slots_left >= segments;
 #endif /* Windows */
 
   ior_item_t *item = ior->pool;
@@ -678,6 +680,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset,
             (uintptr_t)(uint64_t)item->sgv[0].Buffer) &
            ior_alignment_mask) == 0 &&
           ior->last_sgvcnt + segments < OSAL_IOV_MAX) {
+        assert(ior->overlapped_fd);
         assert((item->single.iov_len & ior_WriteFile_flag) == 0);
         assert(item->sgv[ior->last_sgvcnt].Buffer == 0);
         ior->last_bytes += bytes;
@@ -745,6 +748,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset,
     assert((item->single.iov_len & ior_WriteFile_flag) != 0);
   } else {
     /* WriteFileGather() */
+    assert(ior->overlapped_fd);
     item->sgv[0].Buffer = PtrToPtr64(data);
     for (size_t i = 1; i < segments; ++i) {
       data = ptr_disp(data, ior->pagesize);
@@ -814,7 +818,7 @@ MDBX_INTERNAL_FUNC void osal_ioring_walk(
 }
 
 MDBX_INTERNAL_FUNC osal_ioring_write_result_t
-osal_ioring_write(osal_ioring_t *ior) {
+osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) {
   osal_ioring_write_result_t r = {MDBX_SUCCESS, 0};
 
 #if defined(_WIN32) || defined(_WIN64)
@@ -828,6 +832,7 @@ osal_ioring_write(osal_ioring_t *ior) {
     size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag;
     r.wops += 1;
     if (bytes & ior_WriteFile_flag) {
+      assert(ior->overlapped_fd && fd == ior->overlapped_fd);
       bytes = ior->pagesize;
       while (item->sgv[i].Buffer) {
         bytes += ior->pagesize;
@@ -840,11 +845,10 @@ osal_ioring_write(osal_ioring_t *ior) {
         r.err = GetLastError();
       bailout_rc:
         assert(r.err != MDBX_SUCCESS);
-        CancelIo(ior->fd);
+        CancelIo(fd);
         return r;
       }
-      if (WriteFileGather(ior->fd, item->sgv, (DWORD)bytes, nullptr,
-                          &item->ov)) {
+      if (WriteFileGather(fd, item->sgv, (DWORD)bytes, nullptr, &item->ov)) {
         assert(item->ov.Internal == 0 &&
                WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0);
         ior_put_event(ior, item->ov.hEvent);
@@ -854,7 +858,7 @@ osal_ioring_write(osal_ioring_t *ior) {
         if (unlikely(r.err != ERROR_IO_PENDING)) {
           ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
                 ", err %d",
-                "WriteFileGather", ior->fd, __Wpedantic_format_voidptr(item),
+                "WriteFileGather", fd, __Wpedantic_format_voidptr(item),
                 item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno,
                 bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
                 r.err);
@@ -863,11 +867,11 @@ osal_ioring_write(osal_ioring_t *ior) {
         assert(wait_for > ior->event_pool + ior->event_stack);
         *--wait_for = item->ov.hEvent;
       }
-    } else if (ior->flags & IOR_OVERLAPPED) {
+    } else if (fd == ior->overlapped_fd) {
       assert(bytes < MAX_WRITE);
     retry:
       item->ov.hEvent = ior;
-      if (WriteFileEx(ior->fd, item->single.iov_base, (DWORD)bytes, &item->ov,
+      if (WriteFileEx(fd, item->single.iov_base, (DWORD)bytes, &item->ov,
                       ior_wocr)) {
         async_started += 1;
       } else {
@@ -876,7 +880,7 @@ osal_ioring_write(osal_ioring_t *ior) {
         default:
           ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
                 ", err %d",
-                "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item),
+                "WriteFileEx", fd, __Wpedantic_format_voidptr(item),
                 item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno,
                 bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
                 r.err);
@@ -887,7 +891,7 @@ osal_ioring_write(osal_ioring_t *ior) {
           WARNING(
               "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
               ", err %d",
-              "WriteFileEx", ior->fd, __Wpedantic_format_voidptr(item),
+              "WriteFileEx", fd, __Wpedantic_format_voidptr(item),
               item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno,
               bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
               r.err);
@@ -905,12 +909,12 @@ osal_ioring_write(osal_ioring_t *ior) {
     } else {
       assert(bytes < MAX_WRITE);
       DWORD written = 0;
-      if (!WriteFile(ior->fd, item->single.iov_base, (DWORD)bytes, &written,
+      if (!WriteFile(fd, item->single.iov_base, (DWORD)bytes, &written,
                      &item->ov)) {
         r.err = (int)GetLastError();
         ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
               ", err %d",
-              "WriteFile", ior->fd, __Wpedantic_format_voidptr(item),
+              "WriteFile", fd, __Wpedantic_format_voidptr(item),
               item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno,
               bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
               r.err);
@@ -974,8 +978,7 @@ osal_ioring_write(osal_ioring_t *ior) {
         }
         if (!HasOverlappedIoCompleted(&item->ov)) {
           DWORD written = 0;
-          if (unlikely(
-                  !GetOverlappedResult(ior->fd, &item->ov, &written, true))) {
+          if (unlikely(!GetOverlappedResult(fd, &item->ov, &written, true))) {
             ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64
                   ", err %d",
                   "GetOverlappedResult", __Wpedantic_format_voidptr(item),
@@ -1025,16 +1028,16 @@ osal_ioring_write(osal_ioring_t *ior) {
 #if MDBX_HAVE_PWRITEV
     assert(item->sgvcnt > 0);
     if (item->sgvcnt == 1)
-      r.err = osal_pwrite(ior->fd, item->sgv[0].iov_base, item->sgv[0].iov_len,
+      r.err = osal_pwrite(fd, item->sgv[0].iov_base, item->sgv[0].iov_len,
                           item->offset);
     else
-      r.err = osal_pwritev(ior->fd, item->sgv, item->sgvcnt, item->offset);
+      r.err = osal_pwritev(fd, item->sgv, item->sgvcnt, item->offset);
 
     // TODO: io_uring_prep_write(sqe, fd, ...);
 
     item = ior_next(item, item->sgvcnt);
 #else
-    r.err = osal_pwrite(ior->fd, item->single.iov_base, item->single.iov_len,
+    r.err = osal_pwrite(fd, item->single.iov_base, item->single.iov_len,
                         item->offset);
     item = ior_next(item, 1);
 #endif
@@ -1055,8 +1058,10 @@ MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) {
 #if defined(_WIN32) || defined(_WIN64)
   if (ior->last) {
     for (ior_item_t *item = ior->pool; item <= ior->last;) {
-      if (!HasOverlappedIoCompleted(&item->ov))
-        CancelIoEx(ior->fd, &item->ov);
+      if (!HasOverlappedIoCompleted(&item->ov)) {
+        assert(ior->overlapped_fd);
+        CancelIoEx(ior->overlapped_fd, &item->ov);
+      }
       if (item->ov.hEvent && item->ov.hEvent != ior)
         ior_put_event(ior, item->ov.hEvent);
       size_t i = 1;
@@ -1090,13 +1095,12 @@ MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) {
 #if defined(_WIN32) || defined(_WIN64)
   if (ior->state & IOR_STATE_LOCKED)
     return MDBX_SUCCESS;
-  const bool useSetFileIoOverlappedRange = (ior->flags & IOR_OVERLAPPED) &&
-                                           mdbx_SetFileIoOverlappedRange &&
-                                           items > 7;
+  const bool useSetFileIoOverlappedRange =
+      ior->overlapped_fd && mdbx_SetFileIoOverlappedRange && items > 42;
   const size_t ceiling =
       useSetFileIoOverlappedRange
           ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4)
-          : 4096;
+          : 1024;
   const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling);
   items = bytes / sizeof(ior_item_t);
 #endif /* Windows */
@@ -1134,7 +1138,7 @@ MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) {
     ior->boundary = ptr_disp(ior->pool, ior->allocated);
 #if defined(_WIN32) || defined(_WIN64)
     if (useSetFileIoOverlappedRange) {
-      if (mdbx_SetFileIoOverlappedRange(ior->fd, ptr, (ULONG)bytes))
+      if (mdbx_SetFileIoOverlappedRange(ior->overlapped_fd, ptr, (ULONG)bytes))
         ior->state += IOR_STATE_LOCKED;
       else
         return GetLastError();
diff --git a/src/osal.h b/src/osal.h
index aaa7809a..31640c89 100644
--- a/src/osal.h
+++ b/src/osal.h
@@ -312,13 +312,12 @@ typedef struct osal_ioring {
   unsigned slots_left;
   unsigned allocated;
 #if defined(_WIN32) || defined(_WIN64)
-#define IOR_DIRECT 1
-#define IOR_OVERLAPPED 2
 #define IOR_STATE_LOCKED 1
+  HANDLE overlapped_fd;
   unsigned pagesize;
   unsigned last_sgvcnt;
   size_t last_bytes;
-  uint8_t flags, state, pagesize_ln2;
+  uint8_t direct, state, pagesize_ln2;
   unsigned event_stack;
   HANDLE *event_pool;
   volatile LONG async_waiting;
@@ -335,7 +334,6 @@ typedef struct osal_ioring {
 #define ior_last_sgvcnt(ior, item) (1)
 #define ior_last_bytes(ior, item) (item)->single.iov_len
 #endif /* !Windows */
-  mdbx_filehandle_t fd;
   ior_item_t *last;
   ior_item_t *pool;
   char *boundary;
@@ -344,11 +342,13 @@ typedef struct osal_ioring {
 #ifndef __cplusplus
 
 /* Actually this is not ioring for now, but on the way. */
-MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *,
+MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *
 #if defined(_WIN32) || defined(_WIN64)
-                                          uint8_t flags,
+                                          ,
+                                          bool enable_direct,
+                                          mdbx_filehandle_t overlapped_fd
 #endif /* Windows */
-                                          mdbx_filehandle_t fd);
+);
 MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items);
 MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *);
 MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *);
@@ -359,7 +359,7 @@ typedef struct osal_ioring_write_result {
   unsigned wops;
 } osal_ioring_write_result_t;
 MDBX_INTERNAL_FUNC osal_ioring_write_result_t
-osal_ioring_write(osal_ioring_t *ior);
+osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd);
 
 typedef struct iov_ctx iov_ctx_t;
 MDBX_INTERNAL_FUNC void osal_ioring_walk(
@@ -377,11 +377,13 @@ osal_ioring_used(const osal_ioring_t *ior) {
 }
 
 MDBX_MAYBE_UNUSED static inline int
-osal_ioring_reserve(osal_ioring_t *ior, size_t items, size_t bytes) {
+osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) {
   items = (items > 32) ? items : 32;
 #if defined(_WIN32) || defined(_WIN64)
-  const size_t npages = bytes >> ior->pagesize_ln2;
-  items = (items > npages) ? items : npages;
+  if (ior->direct) {
+    const size_t npages = bytes >> ior->pagesize_ln2;
+    items = (items > npages) ? items : npages;
+  }
 #else
   (void)bytes;
 #endif