mdbx: support for huge transactions (MDBX_HUGE_TRANSACTIONS option).

Change-Id: I5d6cce6a7fb816add8cb4c066cc50f31cdebf9d5
This commit is contained in:
Leonid Yuriev 2020-06-04 20:09:02 +03:00
parent fdc92b136f
commit e008f3132d
5 changed files with 31 additions and 14 deletions

View File

@ -378,6 +378,7 @@ add_mdbx_option(MDBX_BUILD_SHARED_LIBRARY "Build libmdbx as shared library (DLL)
add_mdbx_option(MDBX_BUILD_TOOLS "Build MDBX tools (mdbx_chk/stat/dump/load/copy)" ${MDBX_BUILD_TOOLS_DEFAULT}) add_mdbx_option(MDBX_BUILD_TOOLS "Build MDBX tools (mdbx_chk/stat/dump/load/copy)" ${MDBX_BUILD_TOOLS_DEFAULT})
add_mdbx_option(MDBX_TXN_CHECKOWNER "Checking transaction matches the calling thread inside libmdbx's API" ON) add_mdbx_option(MDBX_TXN_CHECKOWNER "Checking transaction matches the calling thread inside libmdbx's API" ON)
add_mdbx_option(MDBX_TXN_CHECKPID "Paranoid checking PID inside libmdbx's API" AUTO) add_mdbx_option(MDBX_TXN_CHECKPID "Paranoid checking PID inside libmdbx's API" AUTO)
add_mdbx_option(MDBX_HUGE_TRANSACTIONS "Support for huge write-transactions" OFF)
mark_as_advanced(MDBX_TXN_CHECKPID) mark_as_advanced(MDBX_TXN_CHECKPID)
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
add_mdbx_option(MDBX_DISABLE_GNU_SOURCE "Don't use GNU/Linux libc extensions" OFF) add_mdbx_option(MDBX_DISABLE_GNU_SOURCE "Don't use GNU/Linux libc extensions" OFF)

View File

@ -12,6 +12,7 @@
#cmakedefine MDBX_FORCE_ASSERTIONS #cmakedefine MDBX_FORCE_ASSERTIONS
/* Common */ /* Common */
#cmakedefine01 MDBX_HUGE_TRANSACTIONS
#cmakedefine01 MDBX_TXN_CHECKOWNER #cmakedefine01 MDBX_TXN_CHECKOWNER
#cmakedefine MDBX_TXN_CHECKPID_AUTO #cmakedefine MDBX_TXN_CHECKPID_AUTO
#ifndef MDBX_TXN_CHECKPID_AUTO #ifndef MDBX_TXN_CHECKPID_AUTO

View File

@ -394,9 +394,9 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, unsigned flags) {
const unsigned page_ln2 = log2n(pagesize); const unsigned page_ln2 = log2n(pagesize);
const size_t hard = 0x7FF00000ul; const size_t hard = 0x7FF00000ul;
const size_t hard_pages = hard >> page_ln2; const size_t hard_pages = hard >> page_ln2;
const size_t limit = (hard_pages < MDBX_DPL_TXNFULL) const size_t limit = (hard_pages < MDBX_DPL_TXNFULL / 3)
? hard ? hard
: ((size_t)MDBX_DPL_TXNFULL << page_ln2); : ((size_t)MDBX_DPL_TXNFULL / 3 << page_ln2);
return (limit < MAX_MAPSIZE) ? limit / 2 : MAX_MAPSIZE / 2; return (limit < MAX_MAPSIZE) ? limit / 2 : MAX_MAPSIZE / 2;
} }
@ -4201,8 +4201,17 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
if (txn->tw.dirtyroom > i) if (txn->tw.dirtyroom > i)
return MDBX_SUCCESS; return MDBX_SUCCESS;
/* Less aggressive spill - we originally spilled the entire dirty list,
* with a few exceptions for cursor pages and DB root pages. But this
* turns out to be a lot of wasted effort because in a large txn many
* of those pages will need to be used again. So now we spill only 1/8th
* of the dirty pages. Testing revealed this to be a good tradeoff,
* better than 1/2, 1/4, or 1/10. */
if (need < MDBX_DPL_TXNFULL / 8)
need = MDBX_DPL_TXNFULL / 8;
if (!txn->tw.spill_pages) { if (!txn->tw.spill_pages) {
txn->tw.spill_pages = mdbx_pnl_alloc(MDBX_DPL_TXNFULL / 8); txn->tw.spill_pages = mdbx_pnl_alloc(need);
if (unlikely(!txn->tw.spill_pages)) if (unlikely(!txn->tw.spill_pages))
return MDBX_ENOMEM; return MDBX_ENOMEM;
} else { } else {
@ -4221,15 +4230,6 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
/* Less aggressive spill - we originally spilled the entire dirty list,
* with a few exceptions for cursor pages and DB root pages. But this
* turns out to be a lot of wasted effort because in a large txn many
* of those pages will need to be used again. So now we spill only 1/8th
* of the dirty pages. Testing revealed this to be a good tradeoff,
* better than 1/2, 1/4, or 1/10. */
if (need < MDBX_DPL_TXNFULL / 8)
need = MDBX_DPL_TXNFULL / 8;
/* Save the page IDs of all the pages we're flushing */ /* Save the page IDs of all the pages we're flushing */
/* flush from the tail forward, this saves a lot of shifting later on. */ /* flush from the tail forward, this saves a lot of shifting later on. */
for (i = dl->length; i && need; i--) { for (i = dl->length; i && need; i--) {
@ -5173,7 +5173,7 @@ skip_cache:
} }
/* Don't try to coalesce too much. */ /* Don't try to coalesce too much. */
if (unlikely(re_len > MDBX_DPL_TXNFULL / 4)) if (unlikely(re_len > MDBX_DPL_TXNFULL / 42))
break; break;
if (re_len /* current size */ >= env->me_maxgc_ov1page || if (re_len /* current size */ >= env->me_maxgc_ov1page ||
(re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >= (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >=
@ -18813,6 +18813,9 @@ __dll_export
#else #else
#error "FIXME: Unsupported byte order" #error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */ #endif /* __BYTE_ORDER__ */
#if MDBX_HUGE_TRANSACTIONS
" MDBX_HUGE_TRANSACTIONS=YES"
#endif /* MDBX_HUGE_TRANSACTIONS */
" MDBX_TXN_CHECKPID=" MDBX_TXN_CHECKPID_CONFIG " MDBX_TXN_CHECKPID=" MDBX_TXN_CHECKPID_CONFIG
" MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
" MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG

View File

@ -647,9 +647,16 @@ typedef MDBX_DP *MDBX_DPL;
#define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \ #define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#if MDBX_HUGE_TRANSACTIONS
#define MDBX_PNL_MAX \
((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
#else
#define MDBX_PNL_MAX \ #define MDBX_PNL_MAX \
((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) #define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
#endif /* MDBX_HUGE_TRANSACTIONS */
#define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \ #define MDBX_TXL_INITIAL \
@ -925,7 +932,7 @@ struct MDBX_env {
MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */ /* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages; MDBX_PNL me_retired_pages;
/* MDBX_DP of pages written during a write txn. Length MDBX_DPL_TXNFULL. */ /* MDBX_DP of pages written during a write txn. */
MDBX_DPL me_dirtylist; MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */ /* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page; unsigned me_maxgc_ov1page;

View File

@ -13,6 +13,11 @@
* *
*/ */
/* Support for huge write-transactions.
 * Falls back to 0 (disabled) when the build has not already defined it;
 * a non-zero value selects the larger MDBX_PNL_MAX and MDBX_DPL_TXNFULL
 * limits. */
#ifndef MDBX_HUGE_TRANSACTIONS
#define MDBX_HUGE_TRANSACTIONS 0
#endif /* MDBX_HUGE_TRANSACTIONS */
/* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ /* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0 #define MDBX_OSX_WANNA_DURABILITY 0
/* using fsync() with a chance of data loss on power failure */