mdbx: alter mdbx_rebalance().

This commit is contained in:
Leo Yuriev 2018-09-09 21:51:04 +03:00
parent ff738f1512
commit 76f7c118c6

View File

@ -10011,12 +10011,14 @@ static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) {
* Returns 0 on success, non-zero on failure. */ * Returns 0 on success, non-zero on failure. */
static int mdbx_rebalance(MDBX_cursor *mc) { static int mdbx_rebalance(MDBX_cursor *mc) {
MDBX_node *node; MDBX_node *node;
int rc, fromleft; int rc;
unsigned ptop, minkeys, thresh; unsigned minkeys, thresh;
MDBX_cursor mn;
indx_t oldki;
if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { mdbx_cassert(mc, mc->mc_snum > 0);
mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
const int pagetype = PAGETYPE(mc->mc_pg[mc->mc_top]);
if (pagetype == P_BRANCH) {
minkeys = 2; minkeys = 2;
thresh = 1; thresh = 1;
} else { } else {
@ -10024,9 +10026,9 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
thresh = FILL_THRESHOLD; thresh = FILL_THRESHOLD;
} }
mdbx_debug("rebalancing %s page %" PRIaPGNO " (has %u keys, %.1f%% full)", mdbx_debug("rebalancing %s page %" PRIaPGNO " (has %u keys, %.1f%% full)",
IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", (pagetype & P_LEAF) ? "leaf" : "branch",
mc->mc_pg[mc->mc_top]->mp_pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), mc->mc_pg[mc->mc_top]->mp_pgno, NUMKEYS(mc->mc_pg[mc->mc_top]),
(float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10); PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10.24);
if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh &&
NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
@ -10042,6 +10044,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0));
if (IS_SUBP(mp)) { if (IS_SUBP(mp)) {
mdbx_debug("Can't rebalance a subpage, ignoring"); mdbx_debug("Can't rebalance a subpage, ignoring");
mdbx_cassert(mc, pagetype & P_LEAF);
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
if (nkeys == 0) { if (nkeys == 0) {
@ -10056,7 +10059,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
if (mc->mc_flags & C_SUB) if (mc->mc_flags & C_SUB)
mdbx_outer_db(mc)->md_leaf_pages -= 1; mdbx_outer_db(mc)->md_leaf_pages -= 1;
rc = mdbx_pnl_append(&mc->mc_txn->mt_befree_pages, mp->mp_pgno); rc = mdbx_pnl_append(&mc->mc_txn->mt_befree_pages, mp->mp_pgno);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
/* Adjust cursors pointing to mp */ /* Adjust cursors pointing to mp */
const MDBX_dbi dbi = mc->mc_dbi; const MDBX_dbi dbi = mc->mc_dbi;
@ -10064,7 +10067,8 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
m2 = m2->mc_next) { m2 = m2->mc_next) {
MDBX_cursor *m3 = MDBX_cursor *m3 =
(mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) if (m3 == mc || !(m3->mc_flags & C_INITIALIZED) ||
(m3->mc_snum < mc->mc_snum))
continue; continue;
if (m3->mc_pg[0] == mp) { if (m3->mc_pg[0] == mp) {
m3->mc_snum = 0; m3->mc_snum = 0;
@ -10082,7 +10086,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
return rc; return rc;
mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
mc->mc_db->md_depth--; mc->mc_db->md_depth--;
mc->mc_db->md_branch_pages--; mc->mc_db->md_branch_pages--;
@ -10093,19 +10097,14 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
mc->mc_pg[i] = mc->mc_pg[i + 1]; mc->mc_pg[i] = mc->mc_pg[i + 1];
mc->mc_ki[i] = mc->mc_ki[i + 1]; mc->mc_ki[i] = mc->mc_ki[i + 1];
} }
{
/* Adjust other cursors pointing to mp */ /* Adjust other cursors pointing to mp */
MDBX_cursor *m2, *m3; MDBX_cursor *m2, *m3;
MDBX_dbi dbi = mc->mc_dbi; MDBX_dbi dbi = mc->mc_dbi;
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
if (mc->mc_flags & C_SUB) m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
m3 = &m2->mc_xcursor->mx_cursor; if (m3 == mc || !(m3->mc_flags & C_INITIALIZED))
else
m3 = m2;
if (m3 == mc)
continue;
if (!(m3->mc_flags & C_INITIALIZED))
continue; continue;
if (m3->mc_pg[0] == mp) { if (m3->mc_pg[0] == mp) {
for (int i = 0; i < mc->mc_db->md_depth; i++) { for (int i = 0; i < mc->mc_db->md_depth; i++) {
@ -10116,7 +10115,11 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
m3->mc_top--; m3->mc_top--;
} }
} }
}
mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) ||
PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype);
mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
} else { } else {
mdbx_debug("root page %" PRIaPGNO mdbx_debug("root page %" PRIaPGNO
" doesn't need rebalancing (flags 0x%x)", " doesn't need rebalancing (flags 0x%x)",
@ -10127,50 +10130,53 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
/* The parent (branch page) must have at least 2 pointers, /* The parent (branch page) must have at least 2 pointers,
* otherwise the tree is invalid. */ * otherwise the tree is invalid. */
ptop = mc->mc_top - 1; const unsigned pre_top = mc->mc_top - 1;
mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[ptop])); mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[pre_top]));
mdbx_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); mdbx_cassert(mc, !IS_SUBP(mc->mc_pg[0]));
mdbx_cassert(mc, NUMKEYS(mc->mc_pg[pre_top]) > 1);
/* Leaf page fill factor is below the threshold. /* Leaf page fill factor is below the threshold.
* Try to move keys from left or right neighbor, or * Try to move keys from left or right neighbor, or
* merge with a neighbor page. */ * merge with a neighbor page. */
/* Find neighbors. */ /* Find neighbors. */
MDBX_cursor mn;
mdbx_cursor_copy(mc, &mn); mdbx_cursor_copy(mc, &mn);
mn.mc_xcursor = NULL; mn.mc_xcursor = NULL;
oldki = mc->mc_ki[mc->mc_top]; indx_t oldki = mc->mc_ki[mc->mc_top];
if (mc->mc_ki[ptop] == 0) { bool fromleft;
if (mc->mc_ki[pre_top] == 0) {
/* We're the leftmost leaf in our parent. */ /* We're the leftmost leaf in our parent. */
mdbx_debug("reading right neighbor"); mdbx_debug("reading right neighbor");
mn.mc_ki[ptop]++; mn.mc_ki[pre_top]++;
node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); node = NODEPTR(mc->mc_pg[pre_top], mn.mc_ki[pre_top]);
rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
mdbx_cassert(mc, PAGETYPE(mn.mc_pg[mn.mc_top]) == mdbx_cassert(mc, PAGETYPE(mn.mc_pg[mn.mc_top]) ==
PAGETYPE(mc->mc_pg[mc->mc_top])); PAGETYPE(mc->mc_pg[mc->mc_top]));
mn.mc_ki[mn.mc_top] = 0; mn.mc_ki[mn.mc_top] = 0;
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
fromleft = 0; fromleft = false;
} else { } else {
/* There is at least one neighbor to the left. */ /* There is at least one neighbor to the left. */
mdbx_debug("reading left neighbor"); mdbx_debug("reading left neighbor");
mn.mc_ki[ptop]--; mn.mc_ki[pre_top]--;
node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); node = NODEPTR(mc->mc_pg[pre_top], mn.mc_ki[pre_top]);
rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
mdbx_cassert(mc, PAGETYPE(mn.mc_pg[mn.mc_top]) == mdbx_cassert(mc, PAGETYPE(mn.mc_pg[mn.mc_top]) ==
PAGETYPE(mc->mc_pg[mc->mc_top])); PAGETYPE(mc->mc_pg[mc->mc_top]));
mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
mc->mc_ki[mc->mc_top] = 0; mc->mc_ki[mc->mc_top] = 0;
fromleft = 1; fromleft = true;
} }
mdbx_debug("found neighbor page %" PRIaPGNO " (%u keys, %.1f%% full)", mdbx_debug("found neighbor page %" PRIaPGNO " (%u keys, %.1f%% full)",
mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
(float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10.24);
/* If the neighbor page is above threshold and has enough keys, /* If the neighbor page is above threshold and has enough keys,
* move one key from it. Otherwise we should try to merge them. * move one key from it. Otherwise we should try to merge them.
@ -10178,24 +10184,39 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh &&
NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
rc = mdbx_node_move(&mn, mc, fromleft); rc = mdbx_node_move(&mn, mc, fromleft);
if (fromleft) { if (unlikely(rc != MDBX_SUCCESS))
/* if we inserted on left, bump position up */ return rc;
oldki++; oldki += fromleft /* if we inserted on left, bump position up */;
} mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) ||
PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype);
mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
} else { } else {
if (!fromleft) { if (!fromleft) {
rc = mdbx_page_merge(&mn, mc); rc = mdbx_page_merge(&mn, mc);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) ||
PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype);
mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
} else { } else {
oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
/* We want mdbx_rebalance to find mn when doing fixups */ /* We want mdbx_rebalance to find mn when doing fixups */
WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn));
if (unlikely(rc != MDBX_SUCCESS))
return rc;
mdbx_cursor_copy(&mn, mc); mdbx_cursor_copy(&mn, mc);
mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) ||
PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype);
mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
} }
mc->mc_flags &= ~C_EOF; mc->mc_flags &= ~C_EOF;
} }
mc->mc_ki[mc->mc_top] = oldki; mc->mc_ki[mc->mc_top] = oldki;
return rc; return MDBX_SUCCESS;
} }
/* Complete a delete operation started by mdbx_cursor_del(). */ /* Complete a delete operation started by mdbx_cursor_del(). */