mdbx: support for huge transactions (MDBX_HUGE_TRANSACTIONS option).

Change-Id: I5d6cce6a7fb816add8cb4c066cc50f31cdebf9d5
This commit is contained in:
Leonid Yuriev 2020-06-04 20:09:02 +03:00
parent fdc92b136f
commit e008f3132d
5 changed files with 31 additions and 14 deletions

View File

@ -378,6 +378,7 @@ add_mdbx_option(MDBX_BUILD_SHARED_LIBRARY "Build libmdbx as shared library (DLL)
# Build-time feature switches declared through the project's add_mdbx_option()
# helper (defined outside this excerpt; the arguments appear to be
# <name> <description> <default> -- TODO confirm against the helper).
add_mdbx_option(MDBX_BUILD_TOOLS "Build MDBX tools (mdbx_chk/stat/dump/load/copy)" ${MDBX_BUILD_TOOLS_DEFAULT})
add_mdbx_option(MDBX_TXN_CHECKOWNER "Checking transaction matches the calling thread inside libmdbx's API" ON)
add_mdbx_option(MDBX_TXN_CHECKPID "Paranoid checking PID inside libmdbx's API" AUTO)
# Huge write-transaction support; declared with OFF as its last argument.
add_mdbx_option(MDBX_HUGE_TRANSACTIONS "Support for huge write-transactions" OFF)
# NOTE(review): only MDBX_TXN_CHECKPID is passed to mark_as_advanced() here;
# MDBX_HUGE_TRANSACTIONS is not.
mark_as_advanced(MDBX_TXN_CHECKPID)
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
add_mdbx_option(MDBX_DISABLE_GNU_SOURCE "Don't use GNU/Linux libc extensions" OFF)

View File

@ -12,6 +12,7 @@
#cmakedefine MDBX_FORCE_ASSERTIONS
/* Common */
#cmakedefine01 MDBX_HUGE_TRANSACTIONS
#cmakedefine01 MDBX_TXN_CHECKOWNER
#cmakedefine MDBX_TXN_CHECKPID_AUTO
#ifndef MDBX_TXN_CHECKPID_AUTO

View File

@ -394,9 +394,9 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, unsigned flags) {
const unsigned page_ln2 = log2n(pagesize);
const size_t hard = 0x7FF00000ul;
const size_t hard_pages = hard >> page_ln2;
const size_t limit = (hard_pages < MDBX_DPL_TXNFULL)
const size_t limit = (hard_pages < MDBX_DPL_TXNFULL / 3)
? hard
: ((size_t)MDBX_DPL_TXNFULL << page_ln2);
: ((size_t)MDBX_DPL_TXNFULL / 3 << page_ln2);
return (limit < MAX_MAPSIZE) ? limit / 2 : MAX_MAPSIZE / 2;
}
@ -4201,8 +4201,17 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
if (txn->tw.dirtyroom > i)
return MDBX_SUCCESS;
/* Less aggressive spill - we originally spilled the entire dirty list,
* with a few exceptions for cursor pages and DB root pages. But this
* turns out to be a lot of wasted effort because in a large txn many
* of those pages will need to be used again. So now we spill only 1/8th
* of the dirty pages. Testing revealed this to be a good tradeoff,
* better than 1/2, 1/4, or 1/10. */
if (need < MDBX_DPL_TXNFULL / 8)
need = MDBX_DPL_TXNFULL / 8;
if (!txn->tw.spill_pages) {
txn->tw.spill_pages = mdbx_pnl_alloc(MDBX_DPL_TXNFULL / 8);
txn->tw.spill_pages = mdbx_pnl_alloc(need);
if (unlikely(!txn->tw.spill_pages))
return MDBX_ENOMEM;
} else {
@ -4221,15 +4230,6 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
/* Less aggressive spill - we originally spilled the entire dirty list,
* with a few exceptions for cursor pages and DB root pages. But this
* turns out to be a lot of wasted effort because in a large txn many
* of those pages will need to be used again. So now we spill only 1/8th
* of the dirty pages. Testing revealed this to be a good tradeoff,
* better than 1/2, 1/4, or 1/10. */
if (need < MDBX_DPL_TXNFULL / 8)
need = MDBX_DPL_TXNFULL / 8;
/* Save the page IDs of all the pages we're flushing */
/* flush from the tail forward, this saves a lot of shifting later on. */
for (i = dl->length; i && need; i--) {
@ -5173,7 +5173,7 @@ skip_cache:
}
/* Don't try to coalesce too much. */
if (unlikely(re_len > MDBX_DPL_TXNFULL / 4))
if (unlikely(re_len > MDBX_DPL_TXNFULL / 42))
break;
if (re_len /* current size */ >= env->me_maxgc_ov1page ||
(re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >=
@ -18813,6 +18813,9 @@ __dll_export
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
#if MDBX_HUGE_TRANSACTIONS
" MDBX_HUGE_TRANSACTIONS=YES"
#endif /* MDBX_HUGE_TRANSACTIONS */
" MDBX_TXN_CHECKPID=" MDBX_TXN_CHECKPID_CONFIG
" MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
" MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG

View File

@ -647,9 +647,16 @@ typedef MDBX_DP *MDBX_DPL;
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
/* MDBX_PNL_MAX and MDBX_DPL_TXNFULL are selected by MDBX_HUGE_TRANSACTIONS:
 *  - non-zero: MDBX_PNL_MAX is derived from 2^26 and MDBX_DPL_TXNFULL is
 *              one half of MDBX_PNL_MAX;
 *  - zero:     MDBX_PNL_MAX is derived from 2^24 and MDBX_DPL_TXNFULL is
 *              one quarter of MDBX_PNL_MAX.
 * In both branches 2 items plus MDBX_ASSUME_MALLOC_OVERHEAD (converted to
 * pgno_t units) are subtracted from the power of two. */
#if MDBX_HUGE_TRANSACTIONS
#define MDBX_PNL_MAX \
((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
#else
#define MDBX_PNL_MAX \
((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
#endif /* MDBX_HUGE_TRANSACTIONS */
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
@ -925,7 +932,7 @@ struct MDBX_env {
MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
/* MDBX_DP of pages written during a write txn. Length MDBX_DPL_TXNFULL. */
/* MDBX_DP of pages written during a write txn. */
MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;

View File

@ -13,6 +13,11 @@
*
*/
/* Support for huge write-transactions.
 * Falls back to 0 when MDBX_HUGE_TRANSACTIONS has not already been defined
 * before this point, so the macro is always defined afterwards. */
#ifndef MDBX_HUGE_TRANSACTIONS
#define MDBX_HUGE_TRANSACTIONS 0
#endif /* MDBX_HUGE_TRANSACTIONS */
/* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/* using fsync() with chance of data loss on power failure */