mdbx: refine page-rebalance.

Change-Id: Ia94379fa51eb5da1e1ec1a52cc1dd8b67ceb150a
This commit is contained in:
Leonid Yuriev 2021-03-03 16:05:52 +03:00
parent 46dcd6e7ca
commit 0cebc50291

View File

@ -16175,25 +16175,29 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
/* The threshold of minimum page fill factor, as a number of free bytes on a /* The threshold of minimum page fill factor, as a number of free bytes on a
* page. Pages emptier than this are candidates for merging. */ * page. Pages emptier than this are candidates for merging. */
const unsigned room_threshold = unsigned room_threshold =
page_space(mc->mc_txn->mt_env) - page_space(mc->mc_txn->mt_env) -
(page_space(mc->mc_txn->mt_env) >> threshold_fill_exp2); (page_space(mc->mc_txn->mt_env) >> threshold_fill_exp2);
const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; const MDBX_page *const tp = mc->mc_pg[mc->mc_top];
mdbx_debug("rebalancing %s page %" PRIaPGNO " (has %u keys, %.1f%% full)", const unsigned numkeys = page_numkeys(tp);
(pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, const unsigned room = page_room(tp);
page_numkeys(tp), page_fill(mc->mc_txn->mt_env, tp)); mdbx_debug("rebalancing %s page %" PRIaPGNO
" (has %u keys, full %.1f%%, used %u, room %u bytes )",
(pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys,
page_fill(mc->mc_txn->mt_env, tp),
page_used(mc->mc_txn->mt_env, tp), room);
if (unlikely(page_numkeys(tp) < minkeys)) { if (unlikely(numkeys < minkeys)) {
mdbx_debug("page %" PRIaPGNO " must be merged due keys < %u threshold", mdbx_debug("page %" PRIaPGNO " must be merged due keys < %u threshold",
tp->mp_pgno, minkeys); tp->mp_pgno, minkeys);
} else if (unlikely(page_room(tp) > room_threshold)) { } else if (unlikely(room > room_threshold)) {
mdbx_debug("page %" PRIaPGNO " should be merged due room %u > %u threshold", mdbx_debug("page %" PRIaPGNO " should be merged due room %u > %u threshold",
tp->mp_pgno, page_room(tp), room_threshold); tp->mp_pgno, room, room_threshold);
} else { } else {
mdbx_debug("no need to rebalance page %" PRIaPGNO mdbx_debug("no need to rebalance page %" PRIaPGNO
", room %u < %u threshold", ", room %u < %u threshold",
tp->mp_pgno, page_room(tp), room_threshold); tp->mp_pgno, room, room_threshold);
mdbx_cassert(mc, mc->mc_db->md_entries > 0); mdbx_cassert(mc, mc->mc_db->md_entries > 0);
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
@ -16316,110 +16320,86 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top])); mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top]));
} }
const indx_t ki_top = mc->mc_ki[mc->mc_top]; const unsigned ki_top = mc->mc_ki[mc->mc_top];
const indx_t ki_pre_top = mn.mc_ki[pre_top]; const unsigned ki_pre_top = mn.mc_ki[pre_top];
const indx_t nkeys = (indx_t)page_numkeys(mn.mc_pg[mn.mc_top]); const unsigned nkeys = page_numkeys(mn.mc_pg[mn.mc_top]);
if (left && page_room(left) > room_threshold &&
(!right || page_room(right) < page_room(left))) { const unsigned left_room = left ? page_room(left) : 0;
const unsigned right_room = right ? page_room(right) : 0;
retry:
if (left_room > room_threshold && left_room >= right_room) {
/* try merge with left */ /* try merge with left */
mdbx_cassert(mc, page_numkeys(left) >= minkeys); mdbx_cassert(mc, page_numkeys(left) >= minkeys);
mn.mc_pg[mn.mc_top] = left; mn.mc_pg[mn.mc_top] = left;
mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1);
mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1); mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1);
mc->mc_ki[mc->mc_top] = 0; mc->mc_ki[mc->mc_top] = 0;
const indx_t new_ki = (indx_t)(ki_top + page_numkeys(left)); const unsigned new_ki = ki_top + page_numkeys(left);
mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
/* We want mdbx_rebalance to find mn when doing fixups */ /* We want mdbx_rebalance to find mn when doing fixups */
WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn));
if (likely(rc != MDBX_RESULT_TRUE)) { if (likely(rc != MDBX_RESULT_TRUE)) {
cursor_copy_internal(&mn, mc); cursor_copy_internal(&mn, mc);
mc->mc_ki[mc->mc_top] = new_ki; mc->mc_ki[mc->mc_top] = (indx_t)new_ki;
mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
return rc; return rc;
} }
} }
if (right && page_room(right) > room_threshold) { if (right_room > room_threshold) {
/* try merge with right */ /* try merge with right */
mdbx_cassert(mc, page_numkeys(right) >= minkeys); mdbx_cassert(mc, page_numkeys(right) >= minkeys);
mn.mc_pg[mn.mc_top] = right; mn.mc_pg[mn.mc_top] = right;
mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1);
mn.mc_ki[mn.mc_top] = 0; mn.mc_ki[mn.mc_top] = 0;
mc->mc_ki[mc->mc_top] = nkeys; mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc));
if (likely(rc != MDBX_RESULT_TRUE)) { if (likely(rc != MDBX_RESULT_TRUE)) {
mc->mc_ki[mc->mc_top] = ki_top; mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
return rc; return rc;
} }
} }
if (left && page_numkeys(left) > minkeys &&
(!right || page_numkeys(right) <= minkeys || const unsigned left_nkeys = left ? page_numkeys(left) : 0;
page_room(right) > page_room(left))) { const unsigned right_nkeys = right ? page_numkeys(right) : 0;
if (left_nkeys > minkeys &&
(right_nkeys <= left_nkeys || right_room >= left_room)) {
/* try move from left */ /* try move from left */
mn.mc_pg[mn.mc_top] = left; mn.mc_pg[mn.mc_top] = left;
mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1);
mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1); mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1);
mc->mc_ki[mc->mc_top] = 0; mc->mc_ki[mc->mc_top] = 0;
WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true)); WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true));
if (likely(rc != MDBX_RESULT_TRUE)) { if (likely(rc != MDBX_RESULT_TRUE)) {
mc->mc_ki[mc->mc_top] = ki_top + 1; mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1);
mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
return rc; return rc;
} }
} }
if (right && page_numkeys(right) > minkeys) { if (right_nkeys > minkeys) {
/* try move from right */ /* try move from right */
mn.mc_pg[mn.mc_top] = right; mn.mc_pg[mn.mc_top] = right;
mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1);
mn.mc_ki[mn.mc_top] = 0; mn.mc_ki[mn.mc_top] = 0;
mc->mc_ki[mc->mc_top] = nkeys; mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, false)); WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, false));
if (likely(rc != MDBX_RESULT_TRUE)) { if (likely(rc != MDBX_RESULT_TRUE)) {
mc->mc_ki[mc->mc_top] = ki_top; mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
return rc; return rc;
} }
} }
if (nkeys >= minkeys) { if (nkeys >= minkeys) {
#if MDBX_DEBUG > 0 mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
if (mdbx_audit_enabled()) if (!mdbx_audit_enabled())
return mdbx_cursor_check(mc, C_UPDATING);
#endif
return MDBX_SUCCESS; return MDBX_SUCCESS;
return mdbx_cursor_check(mc, C_UPDATING);
} }
if (left && (!right || page_room(left) > page_room(right))) { if (likely(room_threshold > 0)) {
/* try merge with left */ room_threshold = 0;
mdbx_cassert(mc, page_numkeys(left) >= minkeys); goto retry;
mn.mc_pg[mn.mc_top] = left;
mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1;
mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1);
mc->mc_ki[mc->mc_top] = 0;
const indx_t new_ki = (indx_t)(ki_top + page_numkeys(left));
mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
/* We want mdbx_rebalance to find mn when doing fixups */
WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn));
if (likely(rc != MDBX_RESULT_TRUE)) {
cursor_copy_internal(&mn, mc);
mc->mc_ki[mc->mc_top] = new_ki;
mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
return rc;
}
}
if (likely(right)) {
/* try merge with right */
mdbx_cassert(mc, page_numkeys(right) >= minkeys);
mn.mc_pg[mn.mc_top] = right;
mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1;
mn.mc_ki[mn.mc_top] = 0;
mc->mc_ki[mc->mc_top] = nkeys;
WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc));
if (likely(rc != MDBX_RESULT_TRUE)) {
mc->mc_ki[mc->mc_top] = ki_top;
mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
return rc;
}
} }
return MDBX_PROBLEM; return MDBX_PROBLEM;
} }