mdbx: split page_alloc() and page_new() into fast and slow/rare parts.

2025-11-26 10:22:26 +08:00 · 2022-06-21 19:48:49 +03:00
parent f1ccc717b4
commit ece2fe2514
1 changed files with 140 additions and 93 deletions
--- a/src/core.c
+++ b/src/core.c
@@ -3819,13 +3819,12 @@ struct page_result {
  int err;
 };

-static struct page_result mdbx_page_alloc(MDBX_cursor *mc, const pgno_t num,
-                                          int flags);
 static txnid_t mdbx_kick_longlived_readers(MDBX_env *env,
                                           const txnid_t laggard);

-static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags,
-                                        const unsigned npages);
+static struct page_result page_new(MDBX_cursor *mc, const unsigned flags);
+static struct page_result page_new_large(MDBX_cursor *mc,
+                                         const unsigned npages);
 static int mdbx_page_touch(MDBX_cursor *mc);
 static int mdbx_cursor_touch(MDBX_cursor *mc);
 static int mdbx_touch_dbi(MDBX_cursor *mc);
@@ -6508,8 +6507,8 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) {
 #define MDBX_ALLOC_NOLOG 32
 #define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW)

-__hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc,
-                                                const pgno_t num, int flags) {
+__cold static struct page_result
+page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) {
  struct page_result ret;
  MDBX_txn *const txn = mc->mc_txn;
  MDBX_env *const env = txn->mt_env;
@@ -6535,37 +6534,6 @@ __hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc,
      flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE);
  }

-  if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) {
-    /* If there are any loose pages, just use them */
-    mdbx_assert(env, (flags & MDBX_ALLOC_SLOT) == 0);
-    if (likely(txn->tw.loose_pages)) {
-#if MDBX_ENABLE_REFUND
-      if (txn->tw.loose_refund_wl > txn->mt_next_pgno) {
-        mdbx_refund(txn);
-        if (unlikely(!txn->tw.loose_pages))
-          goto no_loose;
-      }
-#endif /* MDBX_ENABLE_REFUND */
-
-      ret.page = txn->tw.loose_pages;
-      txn->tw.loose_pages = ret.page->mp_next;
-      txn->tw.loose_count--;
-      mdbx_debug_extra("db %d use loose page %" PRIaPGNO, DDBI(mc),
-                       ret.page->mp_pgno);
-      mdbx_tassert(txn, ret.page->mp_pgno < txn->mt_next_pgno);
-      mdbx_ensure(env, ret.page->mp_pgno >= NUM_METAS);
-      VALGRIND_MAKE_MEM_UNDEFINED(page_data(ret.page), page_space(txn->mt_env));
-      MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(ret.page),
-                                       page_space(txn->mt_env));
-      ret.page->mp_txnid = txn->mt_front;
-      ret.err = MDBX_SUCCESS;
-      return ret;
-    }
-  }
-#if MDBX_ENABLE_REFUND
-no_loose:
-#endif /* MDBX_ENABLE_REFUND */
-
  mdbx_tassert(txn,
               mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
@@ -6968,7 +6936,6 @@ done:

  if (env->me_flags & MDBX_WRITEMAP) {
    ret.page = pgno2page(env, pgno);
-    /* LY: reset no-access flag from mdbx_page_loose() */
    VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
    MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num));
  } else {
@@ -7022,6 +6989,76 @@ done:
  return ret;
 }

+__hot static struct page_result page_alloc(MDBX_cursor *mc) {
+  MDBX_txn *const txn = mc->mc_txn;
+  /* Fast path for one page: loose list, then reclaimed list, else slowpath. */
+  /* If there are any loose pages, just use them */
+  while (likely(txn->tw.loose_pages)) { /* at most one pass: returns or breaks */
+#if MDBX_ENABLE_REFUND
+    if (unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) {
+      mdbx_refund(txn); /* presumably may drain the loose list, hence re-check */
+      if (!txn->tw.loose_pages)
+        break;
+    }
+#endif /* MDBX_ENABLE_REFUND */
+
+    MDBX_page *page = txn->tw.loose_pages; /* unlink the head of the list */
+    txn->tw.loose_pages = page->mp_next;
+    txn->tw.loose_count--;
+    mdbx_debug_extra("db %d use loose page %" PRIaPGNO, DDBI(mc),
+                     page->mp_pgno);
+    mdbx_tassert(txn, page->mp_pgno < txn->mt_next_pgno);
+    mdbx_tassert(txn, page->mp_pgno >= NUM_METAS);
+    VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env));
+    MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env));
+    page->mp_txnid = txn->mt_front; /* stamp the reused page for this txn */
+    struct page_result ret = {page, MDBX_SUCCESS};
+    return ret;
+  }
+  /* Else take one pgno from reclaimed_pglist, unless C_GCFREEZE is set. */
+  if (likely(!(mc->mc_flags & C_GCFREEZE))) {
+    MDBX_PNL pnl = txn->tw.reclaimed_pglist;
+    const unsigned len = MDBX_PNL_SIZE(pnl);
+    if (likely(len > 0)) {
+      MDBX_PNL_SIZE(pnl) = len - 1; /* the list shrinks by one entry */
+#if MDBX_PNL_ASCENDING
+      const pgno_t pgno = pnl[1]; /* take the first entry... */
+      for (unsigned i = 1; i < len; ++i) /* ...and shift the rest down */
+        pnl[i] = pnl[i + 1];
+#else
+      const pgno_t pgno = pnl[len]; /* take the last entry, no shifting */
+#endif
+
+      MDBX_env *const env = txn->mt_env;
+      struct page_result ret;
+      if (env->me_flags & MDBX_WRITEMAP) {
+        ret.page = pgno2page(env, pgno); /* no malloc in WRITEMAP mode */
+        MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize);
+      } else {
+        ret.page = mdbx_page_malloc(txn, 1);
+        if (unlikely(!ret.page)) { /* NOTE(review): pgno is already popped... */
+          ret.err = MDBX_ENOMEM; /* ...and not pushed back here - confirm */
+          return ret;
+        }
+      }
+
+      VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize);
+      ret.page->mp_pgno = pgno;
+      ret.page->mp_leaf2_ksize = 0;
+      ret.page->mp_flags = 0;
+      mdbx_tassert(txn, ret.page->mp_pgno >= NUM_METAS);
+
+      ret.err = mdbx_page_dirty(txn, ret.page, 1); /* ret.page stays set */
+      mdbx_tassert(
+          txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
+      return ret; /* ret.err carries the mdbx_page_dirty() result */
+    }
+  }
+  /* Nothing cheap was available: delegate a one-page request. */
+  return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL);
+}
+
 /* Copy the used portions of a non-overflow page. */
 __hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src,
                                 size_t psize) {
@@ -7131,7 +7168,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) {
    rc = mdbx_pnl_need(&txn->tw.retired_pages, 1);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
-    const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL);
+    const struct page_result par = page_alloc(mc);
    rc = par.err;
    np = par.page;
    if (unlikely(rc != MDBX_SUCCESS))
@@ -9126,9 +9163,9 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) {
  return err;
 }

-/* LY: Prepare a backlog of pages to modify GC itself,
- * while reclaiming is prohibited. It should be enough to prevent search
- * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */
+/* Prepare a backlog of pages to modify GC itself, while reclaiming is
+ * prohibited. It should be enough to prevent search in page_alloc_slowpath()
+ * during a deleting, when GC tree is unbalanced. */
 static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,
                               const bool reserve4retired) {
  const unsigned pages4retiredlist =
@@ -9171,8 +9208,8 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,
    err = gcu_clean_stored_retired(txn, ctx);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
-    err = mdbx_page_alloc(&ctx->cursor.outer, pages4retiredlist,
-                          MDBX_ALLOC_GC | MDBX_ALLOC_FAKE)
+    err = page_alloc_slowpath(&ctx->cursor.outer, pages4retiredlist,
+                              MDBX_ALLOC_GC | MDBX_ALLOC_FAKE)
              .err;
    mdbx_trace("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn),
               err);
@@ -9183,9 +9220,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,

  while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist &&
         err == MDBX_SUCCESS)
-    err = mdbx_page_alloc(&ctx->cursor.outer, 0,
-                          MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE |
-                              MDBX_ALLOC_NOLOG)
+    err = page_alloc_slowpath(&ctx->cursor.outer, 0,
+                              MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
+                                  MDBX_ALLOC_FAKE | MDBX_ALLOC_NOLOG)
              .err;

  ctx->cursor.outer.mc_flags |= C_RECLAIMING;
@@ -9378,10 +9415,10 @@ retry:
        if (txn->tw.loose_count > 0) {
          mdbx_trace("%s: try allocate gc-slot for %u loose-pages",
                     dbg_prefix_mode, txn->tw.loose_count);
-          rc =
-              mdbx_page_alloc(&ctx->cursor.outer, 0,
-                              MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE)
-                  .err;
+          rc = page_alloc_slowpath(&ctx->cursor.outer, 0,
+                                   MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
+                                       MDBX_ALLOC_FAKE)
+                   .err;
          if (rc == MDBX_SUCCESS) {
            mdbx_trace("%s: retry since gc-slot for %u loose-pages available",
                       dbg_prefix_mode, txn->tw.loose_count);
@@ -9552,10 +9589,10 @@ retry:
        ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
        do {
          snap_oldest = mdbx_find_oldest(txn);
-          rc =
-              mdbx_page_alloc(&ctx->cursor.outer, 0,
-                              MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE)
-                  .err;
+          rc = page_alloc_slowpath(&ctx->cursor.outer, 0,
+                                   MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
+                                       MDBX_ALLOC_FAKE)
+                   .err;
          if (likely(rc == MDBX_SUCCESS)) {
            mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode,
                       MDBX_PNL_LAST(txn->tw.lifo_reclaimed));
@@ -9585,7 +9622,7 @@ retry:
        } else {
          mdbx_tassert(txn, txn->tw.last_reclaimed == 0);
          if (unlikely(mdbx_find_oldest(txn) != snap_oldest))
-            /* should retry mdbx_page_alloc(MDBX_ALLOC_GC)
+            /* should retry page_alloc_slowpath(MDBX_ALLOC_GC)
             * if the oldest reader changes since the last attempt */
            goto retry_rid;
          /* no reclaimable GC entries,
@@ -15655,7 +15692,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
      if (unlikely(err != MDBX_SUCCESS))
        return err;
    }
-    struct page_result npr = mdbx_page_new(mc, P_LEAF, 1);
+    struct page_result npr = page_new(mc, P_LEAF);
    if (unlikely(npr.err != MDBX_SUCCESS))
      return npr.err;
    npr.err = mdbx_cursor_push(mc, npr.page);
@@ -15945,7 +15982,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
          nested_dupdb.md_entries = page_numkeys(fp);
          xdata.iov_len = sizeof(nested_dupdb);
          xdata.iov_base = &nested_dupdb;
-          const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL);
+          const struct page_result par = page_alloc(mc);
          mp = par.page;
          if (unlikely(par.err != MDBX_SUCCESS))
            return par.err;
@@ -16301,49 +16338,59 @@ fail:
 }

 /* Allocate and initialize new pages for a database.
- * Set MDBX_TXN_ERROR on failure.
- *
- * [in] mc a  cursor on the database being added to.
- * [in] flags flags defining what type of page is being allocated.
- * [in] num   the number of pages to allocate. This is usually 1,
- *            unless allocating overflow pages for a large record.
- * [out] mp   Address of a page, or NULL on failure.
- *
- * Returns 0 on success, non-zero on failure. */
-static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags,
-                                        const unsigned npages) {
-  struct page_result ret = mdbx_page_alloc(mc, npages, MDBX_ALLOC_ALL);
+ * Set MDBX_TXN_ERROR on failure. */
+static struct page_result page_new(MDBX_cursor *mc, const unsigned flags) {
+  mdbx_cassert(mc, (flags & P_OVERFLOW) == 0); /* large pages: page_new_large() */
+  struct page_result ret = page_alloc(mc); /* NOTE(review): no visible line sets MDBX_TXN_ERROR on failure - presumably a callee does; confirm */
  if (unlikely(ret.err != MDBX_SUCCESS))
    return ret;

-  mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi,
-             ret.page->mp_pgno, npages);
+  mdbx_debug("db %u allocated new page %" PRIaPGNO, mc->mc_dbi,
+             ret.page->mp_pgno);
  ret.page->mp_flags = (uint16_t)flags;
  ret.page->mp_txnid = mc->mc_txn->mt_front;
  mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY);
  mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
+#if MDBX_ENABLE_PGOP_STAT
+  mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; /* one new page */
+#endif /* MDBX_ENABLE_PGOP_STAT */
+
+  STATIC_ASSERT(P_BRANCH == 1); /* so the masked value below is exactly 0 or 1 */
+  const unsigned is_branch = flags & P_BRANCH;
+
+  ret.page->mp_lower = 0;
+  ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); /* page size minus header */
+  mc->mc_db->md_branch_pages += is_branch; /* exactly one of these two... */
+  mc->mc_db->md_leaf_pages += 1 - is_branch; /* ...counters grows by one */
+  if (unlikely(mc->mc_flags & C_SUB)) { /* C_SUB cursor: repeat on the outer DB */
+    MDBX_db *outer = mdbx_outer_db(mc);
+    outer->md_branch_pages += is_branch;
+    outer->md_leaf_pages += 1 - is_branch;
+  }
+  return ret;
+}
+
+static struct page_result page_new_large(MDBX_cursor *mc,
+                                         const unsigned npages) {
+  struct page_result ret =
+      likely(npages == 1) ? page_alloc(mc) /* one page: fast path */
+                          : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL);
+  if (unlikely(ret.err != MDBX_SUCCESS))
+    return ret;
+
+  mdbx_debug("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi,
+             ret.page->mp_pgno, npages);
+  ret.page->mp_flags = P_OVERFLOW; /* the only page type created here */
+  ret.page->mp_txnid = mc->mc_txn->mt_front;
+  mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY);
+  mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
 #if MDBX_ENABLE_PGOP_STAT
  mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages;
 #endif /* MDBX_ENABLE_PGOP_STAT */

-  if (likely((flags & P_OVERFLOW) == 0)) {
-    STATIC_ASSERT(P_BRANCH == 1);
-    const bool is_branch = flags & P_BRANCH;
-    ret.page->mp_lower = 0;
-    ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ);
-    mc->mc_db->md_branch_pages += is_branch;
-    mc->mc_db->md_leaf_pages += 1 - is_branch;
-    if (unlikely(mc->mc_flags & C_SUB)) {
-      MDBX_db *outer = mdbx_outer_db(mc);
-      outer->md_branch_pages += is_branch;
-      outer->md_leaf_pages += 1 - is_branch;
-    }
-  } else {
-    mc->mc_db->md_overflow_pages += npages;
-    ret.page->mp_pages = npages;
-    mdbx_cassert(mc, !(mc->mc_flags & C_SUB));
-  }
-
+  mc->mc_db->md_overflow_pages += npages; /* all npages count as overflow */
+  ret.page->mp_pages = npages; /* page count is stored in the first page */
+  mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); /* not expected on a C_SUB cursor */
  return ret;
 }

@@ -16464,7 +16511,7 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc,
      return MDBX_PROBLEM;
    }
    const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len);
-    const struct page_result npr = mdbx_page_new(mc, P_OVERFLOW, ovpages);
+    const struct page_result npr = page_new_large(mc, ovpages);
    if (unlikely(npr.err != MDBX_SUCCESS))
      return npr.err;
    largepage = npr.page;
@@ -18599,7 +18646,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
  mdbx_cassert(mc, nkeys + 1 >= minkeys * 2);

  /* Create a new sibling page. */
-  struct page_result npr = mdbx_page_new(mc, mp->mp_flags, 1);
+  struct page_result npr = page_new(mc, mp->mp_flags);
  if (unlikely(npr.err != MDBX_SUCCESS))
    return npr.err;
  MDBX_page *const sister = npr.page;
@@ -18611,7 +18658,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
   * the cursor height may be greater because it walks
   * up the stack while finding the branch slot to update. */
  if (mc->mc_top < 1) {
-    npr = mdbx_page_new(mc, P_BRANCH, 1);
+    npr = page_new(mc, P_BRANCH);
    rc = npr.err;
    if (unlikely(rc != MDBX_SUCCESS))
      goto done;