From e008f3132d7d96f86fffb7fc73e46ea1e7d39d3a Mon Sep 17 00:00:00 2001
From: Leonid Yuriev
Date: Thu, 4 Jun 2020 20:09:02 +0300
Subject: [PATCH] mdbx: support for huge transactions (MDBX_HUGE_TRANSACTIONS option).

Change-Id: I5d6cce6a7fb816add8cb4c066cc50f31cdebf9d5
---
Reviewer notes (placed below the "---" marker, so not part of the commit message):
 - MDBX_HUGE_TRANSACTIONS defaults to OFF in CMake and to 0 in src/options.h.
 - When enabled, MDBX_PNL_MAX is derived from (1u << 26) instead of (1u << 24)
   and MDBX_DPL_TXNFULL becomes MDBX_PNL_MAX / 2 instead of MDBX_PNL_MAX / 4.
 - mdbx_limits_valsize_max() now compares against and scales MDBX_DPL_TXNFULL / 3.
 - mdbx_page_spill() raises `need` to at least MDBX_DPL_TXNFULL / 8 before
   tw.spill_pages is allocated and passes `need` to mdbx_pnl_alloc().
 - The "don't coalesce too much" cut-off changes from MDBX_DPL_TXNFULL / 4
   to MDBX_DPL_TXNFULL / 42.

 CMakeLists.txt  |  1 +
 src/config.h.in |  1 +
 src/core.c      | 29 ++++++++++++++++-------------
 src/internals.h |  9 ++++++++-
 src/options.h   |  5 +++++
 5 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3652ee9..40355c27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -378,6 +378,7 @@ add_mdbx_option(MDBX_BUILD_SHARED_LIBRARY "Build libmdbx as shared library (DLL)
 add_mdbx_option(MDBX_BUILD_TOOLS "Build MDBX tools (mdbx_chk/stat/dump/load/copy)" ${MDBX_BUILD_TOOLS_DEFAULT})
 add_mdbx_option(MDBX_TXN_CHECKOWNER "Checking transaction matches the calling thread inside libmdbx's API" ON)
 add_mdbx_option(MDBX_TXN_CHECKPID "Paranoid checking PID inside libmdbx's API" AUTO)
+add_mdbx_option(MDBX_HUGE_TRANSACTIONS "Support for huge write-transactions" OFF)
 mark_as_advanced(MDBX_TXN_CHECKPID)
 if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
   add_mdbx_option(MDBX_DISABLE_GNU_SOURCE "Don't use GNU/Linux libc extensions" OFF)
diff --git a/src/config.h.in b/src/config.h.in
index 12f3dc63..871944e1 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -12,6 +12,7 @@
 #cmakedefine MDBX_FORCE_ASSERTIONS
 
 /* Common */
+#cmakedefine01 MDBX_HUGE_TRANSACTIONS
 #cmakedefine01 MDBX_TXN_CHECKOWNER
 #cmakedefine MDBX_TXN_CHECKPID_AUTO
 #ifndef MDBX_TXN_CHECKPID_AUTO
diff --git a/src/core.c b/src/core.c
index 6176b408..94f18b2f 100644
--- a/src/core.c
+++ b/src/core.c
@@ -394,9 +394,9 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, unsigned flags) {
   const unsigned page_ln2 = log2n(pagesize);
   const size_t hard = 0x7FF00000ul;
   const size_t hard_pages = hard >> page_ln2;
-  const size_t limit = (hard_pages < MDBX_DPL_TXNFULL)
+  const size_t limit = (hard_pages < MDBX_DPL_TXNFULL / 3)
                            ? hard
-                           : ((size_t)MDBX_DPL_TXNFULL << page_ln2);
+                           : ((size_t)MDBX_DPL_TXNFULL / 3 << page_ln2);
   return (limit < MAX_MAPSIZE) ? limit / 2 : MAX_MAPSIZE / 2;
 }
 
@@ -4201,8 +4201,17 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
   if (txn->tw.dirtyroom > i)
     return MDBX_SUCCESS;
 
+  /* Less aggressive spill - we originally spilled the entire dirty list,
+   * with a few exceptions for cursor pages and DB root pages. But this
+   * turns out to be a lot of wasted effort because in a large txn many
+   * of those pages will need to be used again. So now we spill only 1/8th
+   * of the dirty pages. Testing revealed this to be a good tradeoff,
+   * better than 1/2, 1/4, or 1/10. */
+  if (need < MDBX_DPL_TXNFULL / 8)
+    need = MDBX_DPL_TXNFULL / 8;
+
   if (!txn->tw.spill_pages) {
-    txn->tw.spill_pages = mdbx_pnl_alloc(MDBX_DPL_TXNFULL / 8);
+    txn->tw.spill_pages = mdbx_pnl_alloc(need);
     if (unlikely(!txn->tw.spill_pages))
       return MDBX_ENOMEM;
   } else {
@@ -4221,15 +4230,6 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
   if (unlikely(rc != MDBX_SUCCESS))
     goto bailout;
 
-  /* Less aggressive spill - we originally spilled the entire dirty list,
-   * with a few exceptions for cursor pages and DB root pages. But this
-   * turns out to be a lot of wasted effort because in a large txn many
-   * of those pages will need to be used again. So now we spill only 1/8th
-   * of the dirty pages. Testing revealed this to be a good tradeoff,
-   * better than 1/2, 1/4, or 1/10. */
-  if (need < MDBX_DPL_TXNFULL / 8)
-    need = MDBX_DPL_TXNFULL / 8;
-
   /* Save the page IDs of all the pages we're flushing */
   /* flush from the tail forward, this saves a lot of shifting later on. */
   for (i = dl->length; i && need; i--) {
@@ -5173,7 +5173,7 @@ skip_cache:
       }
 
       /* Don't try to coalesce too much. */
-      if (unlikely(re_len > MDBX_DPL_TXNFULL / 4))
+      if (unlikely(re_len > MDBX_DPL_TXNFULL / 42))
         break;
       if (re_len /* current size */ >= env->me_maxgc_ov1page ||
           (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >=
@@ -18813,6 +18813,9 @@ __dll_export
 #else
 #error "FIXME: Unsupported byte order"
 #endif /* __BYTE_ORDER__ */
+#if MDBX_HUGE_TRANSACTIONS
+    " MDBX_HUGE_TRANSACTIONS=YES"
+#endif /* MDBX_HUGE_TRANSACTIONS */
     " MDBX_TXN_CHECKPID=" MDBX_TXN_CHECKPID_CONFIG
     " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
     " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG
diff --git a/src/internals.h b/src/internals.h
index 650ea3c4..a8b391e3 100644
--- a/src/internals.h
+++ b/src/internals.h
@@ -647,9 +647,16 @@ typedef MDBX_DP *MDBX_DPL;
 #define MDBX_PNL_GRANULATE 1024
 #define MDBX_PNL_INITIAL                                                       \
   (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
+
+#if MDBX_HUGE_TRANSACTIONS
+#define MDBX_PNL_MAX                                                           \
+  ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
+#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
+#else
 #define MDBX_PNL_MAX                                                           \
   ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
 #define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
+#endif /* MDBX_HUGE_TRANSACTIONS */
 
 #define MDBX_TXL_GRANULATE 32
 #define MDBX_TXL_INITIAL                                                       \
@@ -925,7 +932,7 @@ struct MDBX_env {
   MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
   /* PNL of pages that became unused in a write txn */
   MDBX_PNL me_retired_pages;
-  /* MDBX_DP of pages written during a write txn. Length MDBX_DPL_TXNFULL. */
+  /* MDBX_DP of pages written during a write txn. */
   MDBX_DPL me_dirtylist;
   /* Number of freelist items that can fit in a single overflow page */
   unsigned me_maxgc_ov1page;
diff --git a/src/options.h b/src/options.h
index 715f883b..f3805b1e 100644
--- a/src/options.h
+++ b/src/options.h
@@ -13,6 +13,11 @@
  *
  */
 
+/* Support for huge write-transactions */
+#ifndef MDBX_HUGE_TRANSACTIONS
+#define MDBX_HUGE_TRANSACTIONS 0
+#endif /* MDBX_HUGE_TRANSACTIONS */
+
 /* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
 #define MDBX_OSX_WANNA_DURABILITY 0
 /* using fsync() with chance of data lost on power failure */