mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-06 18:44:13 +08:00
mdbx: refine mdbx_page_spill()
.
More for https://github.com/erthink/libmdbx/issues/132. Change-Id: I40788c13b54453c17d5e5dae6c3d2f93226f8e00
This commit is contained in:
parent
12ee5e6cac
commit
338de2e1fb
73
src/core.c
73
src/core.c
@ -4317,51 +4317,58 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
|
|||||||
const MDBX_val *data) {
|
const MDBX_val *data) {
|
||||||
if (mc->mc_flags & C_SUB)
|
if (mc->mc_flags & C_SUB)
|
||||||
return MDBX_SUCCESS;
|
return MDBX_SUCCESS;
|
||||||
|
|
||||||
MDBX_txn *txn = mc->mc_txn;
|
MDBX_txn *txn = mc->mc_txn;
|
||||||
MDBX_dpl *const dl = txn->tw.dirtylist;
|
if (txn->mt_flags & MDBX_WRITEMAP)
|
||||||
|
|
||||||
/* Estimate how much space this op will take */
|
|
||||||
pgno_t i = mc->mc_db->md_depth;
|
|
||||||
/* Named DBs also dirty the main DB */
|
|
||||||
if (mc->mc_dbi >= CORE_DBS)
|
|
||||||
i += txn->mt_dbs[MAIN_DBI].md_depth;
|
|
||||||
/* For puts, roughly factor in the key+data size */
|
|
||||||
if (key)
|
|
||||||
i += bytes2pgno(txn->mt_env, node_size(key, data) + txn->mt_env->me_psize);
|
|
||||||
i += i; /* double it for good measure */
|
|
||||||
pgno_t need = i;
|
|
||||||
|
|
||||||
if (txn->tw.dirtyroom > i)
|
|
||||||
return MDBX_SUCCESS;
|
return MDBX_SUCCESS;
|
||||||
|
|
||||||
/* Less aggressive spill - we originally spilled the entire dirty list,
|
/* Estimate how much space this op will take: */
|
||||||
* with a few exceptions for cursor pages and DB root pages. But this
|
/* 1) Max b-tree height, a reasonable bound that also covers the dups' sub-tree */
|
||||||
* turns out to be a lot of wasted effort because in a large txn many
|
size_t need = CURSOR_STACK + 3;
|
||||||
* of those pages will need to be used again. So now we spill only 1/8th
|
/* 2) GC/FreeDB for any payload */
|
||||||
* of the dirty pages. Testing revealed this to be a good tradeoff,
|
if (mc->mc_dbi > FREE_DBI) {
|
||||||
* better than 1/2, 1/4, or 1/10. */
|
need += txn->mt_dbs[FREE_DBI].md_depth + 3;
|
||||||
if (need < txn->mt_env->me_options.dp_limit / 8)
|
/* 3) Named DBs also dirty the main DB */
|
||||||
need = txn->mt_env->me_options.dp_limit / 8;
|
if (mc->mc_dbi > MAIN_DBI)
|
||||||
|
need += txn->mt_dbs[MAIN_DBI].md_depth + 3;
|
||||||
|
}
|
||||||
|
/* 4) Roughly factor in the key+data size */
|
||||||
|
need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1;
|
||||||
|
/* 5) Double it to keep a sufficient safety reserve */
|
||||||
|
need += need;
|
||||||
|
if (likely(txn->tw.dirtyroom > need))
|
||||||
|
return MDBX_SUCCESS;
|
||||||
|
|
||||||
|
const size_t spill_min = (txn->tw.dirtylist->length / /* TODO: options */ 8);
|
||||||
|
const size_t spill_max = (txn->tw.dirtylist->length / /* TODO: options */ 2);
|
||||||
|
size_t spill = need - txn->tw.dirtyroom;
|
||||||
|
spill = (spill < spill_max) ? spill : spill_max;
|
||||||
|
spill = (spill > spill_min) ? spill : spill_min;
|
||||||
|
|
||||||
|
int rc;
|
||||||
if (!txn->tw.spill_pages) {
|
if (!txn->tw.spill_pages) {
|
||||||
txn->tw.spill_pages = mdbx_pnl_alloc(need);
|
txn->tw.spill_pages = mdbx_pnl_alloc(spill);
|
||||||
if (unlikely(!txn->tw.spill_pages))
|
if (unlikely(!txn->tw.spill_pages)) {
|
||||||
return MDBX_ENOMEM;
|
rc = MDBX_ENOMEM;
|
||||||
|
goto bailout;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
/* purge deleted slots */
|
/* purge deleted slots */
|
||||||
mdbx_pnl_purge_odd(txn->tw.spill_pages, 1);
|
mdbx_pnl_purge_odd(txn->tw.spill_pages, 1);
|
||||||
|
mdbx_pnl_reserve(&txn->tw.spill_pages, spill);
|
||||||
}
|
}
|
||||||
|
mdbx_notice("spilling %zu pages (have %u dirty-room, need %zu)", spill,
|
||||||
|
txn->tw.dirtyroom, need);
|
||||||
|
|
||||||
/* Preserve pages which may soon be dirtied again */
|
/* Preserve pages which may soon be dirtied again */
|
||||||
mdbx_pages_xkeep(mc, P_DIRTY, true);
|
mdbx_pages_xkeep(mc, P_DIRTY, true);
|
||||||
|
|
||||||
|
MDBX_dpl *const dl = mdbx_dpl_sort(txn->tw.dirtylist);
|
||||||
/* Save the page IDs of all the pages we're flushing */
|
/* Save the page IDs of all the pages we're flushing */
|
||||||
/* flush from the tail forward, this saves a lot of shifting later on. */
|
/* flush from the tail forward, this saves a lot of shifting later on. */
|
||||||
int rc;
|
size_t keep = dl->length;
|
||||||
for (i = dl->length; i && need; i--) {
|
for (; keep && spill; keep--) {
|
||||||
pgno_t pn = dl->items[i].pgno << 1;
|
pgno_t pn = dl->items[keep].pgno << 1;
|
||||||
MDBX_page *dp = dl->items[i].ptr;
|
MDBX_page *dp = dl->items[keep].ptr;
|
||||||
if (dp->mp_flags & (P_LOOSE | P_KEEP))
|
if (dp->mp_flags & (P_LOOSE | P_KEEP))
|
||||||
continue;
|
continue;
|
||||||
/* Can't spill twice,
|
/* Can't spill twice,
|
||||||
@ -4381,17 +4388,17 @@ static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
|
|||||||
rc = mdbx_pnl_append(&txn->tw.spill_pages, pn);
|
rc = mdbx_pnl_append(&txn->tw.spill_pages, pn);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto bailout;
|
goto bailout;
|
||||||
need--;
|
spill--;
|
||||||
}
|
}
|
||||||
mdbx_pnl_sort(txn->tw.spill_pages);
|
mdbx_pnl_sort(txn->tw.spill_pages);
|
||||||
|
|
||||||
/* Flush the spilled part of dirty list */
|
/* Flush the spilled part of dirty list */
|
||||||
rc = mdbx_page_flush(txn, i);
|
rc = mdbx_page_flush(txn, keep);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto bailout;
|
goto bailout;
|
||||||
|
|
||||||
/* Reset any dirty pages we kept that page_flush didn't see */
|
/* Reset any dirty pages we kept that page_flush didn't see */
|
||||||
mdbx_pages_xkeep(mc, P_DIRTY | P_KEEP, i != 0);
|
mdbx_pages_xkeep(mc, P_DIRTY | P_KEEP, keep > 0);
|
||||||
|
|
||||||
bailout:
|
bailout:
|
||||||
txn->mt_flags |= rc ? MDBX_TXN_ERROR : MDBX_TXN_SPILLS;
|
txn->mt_flags |= rc ? MDBX_TXN_ERROR : MDBX_TXN_SPILLS;
|
||||||
|
@ -831,13 +831,10 @@ struct MDBX_txn {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
|
|
||||||
* At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
|
|
||||||
* raise this on a 64 bit machine. */
|
|
||||||
#if MDBX_WORDBITS >= 64
|
#if MDBX_WORDBITS >= 64
|
||||||
#define CURSOR_STACK 28
|
#define CURSOR_STACK 32
|
||||||
#else
|
#else
|
||||||
#define CURSOR_STACK 20
|
#define CURSOR_STACK 24
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct MDBX_xcursor;
|
struct MDBX_xcursor;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user