mdbx: MAJOR rework page splitting (squashed).

Basically, this (squashed) commit introduces:
 - An "auto-appending" feature upon insertion for both ascending and
   descending key sequences. As a result, page filling becomes
   significantly better (denser pages, less slack) when
   inserting ordered sequences of keys,
 - "Splitting at the middle" for a more balanced page tree on average.

---

1. Using left/middle/right tactics for finding the split point of a page:
   - If a key is inserted close to an edge of the page,
     then the page splits at that edge;
   - Otherwise the page splits at the middle,
     which leads to a more balanced tree on average;
   - So I expect better behavior on average,
     but the actual effects should be studied further in practice.

2. New code for calculating the midpoint of a page split.

3. APPEND-flags no longer affect the choice of the page split point.

4. Added left-side splitting by inserting a pure page with a new entry.

Change-Id: Id7441acfc8c90636e3be6bc00a0df15714690f3c
This commit is contained in:
Leonid Yuriev 2021-03-29 00:49:34 +03:00
parent e0795227e4
commit af9b7b5605

View File

@ -57,6 +57,13 @@ MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n(size_t value) {
#endif #endif
} }
/* Returns the absolute value of `value` as an unsigned integer, computed
 * arithmetically rather than with an if/else.
 *
 * The sign is expanded into a mask that is all-ones for a negative value
 * and zero otherwise. Adding the mask and then XOR-ing with it negates a
 * negative value (two's-complement style, in unsigned arithmetic) and
 * leaves a non-negative value unchanged.
 *
 * The mask is built from the comparison `value < 0` instead of a right
 * shift of a signed integer, since the result of right-shifting a negative
 * value is implementation-defined in C.
 *
 * Precondition (checked by assert): value > INT_MIN. */
MDBX_NOTHROW_CONST_FUNCTION static unsigned branchless_abs(int value) {
  assert(value > INT_MIN);
  /* 0u - 1u wraps to UINT_MAX (all bits set); 0u - 0u stays zero. */
  const unsigned expanded_sign = 0u - (unsigned)(value < 0);
  return ((unsigned)value + expanded_sign) ^ expanded_sign;
}
/* Pack/Unpack 16-bit values for Grow step & Shrink threshold */ /* Pack/Unpack 16-bit values for Grow step & Shrink threshold */
MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m, MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m,
unsigned e) { unsigned e) {
@ -3683,8 +3690,8 @@ static int __must_check_result mdbx_page_flush(MDBX_txn *txn,
#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ #define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */
static int __must_check_result mdbx_page_split(MDBX_cursor *mc, static int __must_check_result mdbx_page_split(MDBX_cursor *mc,
const MDBX_val *newkey, const MDBX_val *const newkey,
MDBX_val *newdata, MDBX_val *const newdata,
pgno_t newpgno, unsigned nflags); pgno_t newpgno, unsigned nflags);
static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta, static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta,
@ -14583,8 +14590,6 @@ new_sub:;
size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len
: leaf_size(env, key, rdata); : leaf_size(env, key, rdata);
if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { if (page_room(mc->mc_pg[mc->mc_top]) < nsize) {
if ((flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA)
nflags &= ~MDBX_APPEND; /* sub-page may need room to grow */
if (!insert_key) if (!insert_key)
nflags |= MDBX_SPLIT_REPLACE; nflags |= MDBX_SPLIT_REPLACE;
rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags);
@ -17079,49 +17084,58 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
* [in] newpgno The page number, if the new node is a branch node. * [in] newpgno The page number, if the new node is a branch node.
* [in] nflags The NODE_ADD_FLAGS for the new node. * [in] nflags The NODE_ADD_FLAGS for the new node.
* Returns 0 on success, non-zero on failure. */ * Returns 0 on success, non-zero on failure. */
static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
MDBX_val *newdata, pgno_t newpgno, unsigned nflags) { MDBX_val *const newdata, pgno_t newpgno,
unsigned nflags) {
unsigned flags; unsigned flags;
int rc = MDBX_SUCCESS, foliage = 0, did_split = 0; int rc = MDBX_SUCCESS, foliage = 0;
pgno_t pgno = 0;
unsigned i, ptop; unsigned i, ptop;
MDBX_env *env = mc->mc_txn->mt_env; MDBX_env *const env = mc->mc_txn->mt_env;
MDBX_val sepkey, rkey, xdata; MDBX_val sepkey, rkey, xdata;
MDBX_page *copy = NULL; MDBX_page *tmp_ki_copy = NULL;
MDBX_page *rp, *pp;
MDBX_cursor mn;
DKBUF; DKBUF;
MDBX_page *mp = mc->mc_pg[mc->mc_top]; MDBX_page *const mp = mc->mc_pg[mc->mc_top];
unsigned newindx = mc->mc_ki[mc->mc_top]; const unsigned newindx = mc->mc_ki[mc->mc_top];
unsigned nkeys = page_numkeys(mp); unsigned nkeys = page_numkeys(mp);
if (mdbx_audit_enabled()) { if (mdbx_audit_enabled()) {
rc = mdbx_cursor_check(mc, C_UPDATING); rc = mdbx_cursor_check(mc, C_UPDATING);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
} }
STATIC_ASSERT(P_BRANCH == 1);
const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1;
mdbx_cassert(mc, nkeys + 1 >= (unsigned)(IS_BRANCH(mp) ? 4 : 2)); mdbx_debug(">> splitting %s-page %" PRIaPGNO
mdbx_debug("-----> splitting %s page %" PRIaPGNO " and adding %zu+%zu [%s] at %i, nkeys %i",
" and adding [%s] at index %i/%i", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len,
IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey), newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey),
mc->mc_ki[mc->mc_top], nkeys); mc->mc_ki[mc->mc_top], nkeys);
mdbx_cassert(mc, nkeys + 1 >= minkeys * 2);
/* Create a right sibling. */ /* Create a new sibling page. */
if ((rc = mdbx_page_new(mc, mp->mp_flags, 1, &rp))) MDBX_page *sister;
rc = mdbx_page_new(mc, mp->mp_flags, 1, &sister);
if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
rp->mp_leaf2_ksize = mp->mp_leaf2_ksize; sister->mp_leaf2_ksize = mp->mp_leaf2_ksize;
mdbx_debug("new right sibling: page %" PRIaPGNO, rp->mp_pgno); mdbx_debug("new sibling: page %" PRIaPGNO, sister->mp_pgno);
/* Usually when splitting the root page, the cursor /* Usually when splitting the root page, the cursor
* height is 1. But when called from mdbx_update_key, * height is 1. But when called from mdbx_update_key,
* the cursor height may be greater because it walks * the cursor height may be greater because it walks
* up the stack while finding the branch slot to update. */ * up the stack while finding the branch slot to update. */
if (mc->mc_top < 1) { if (mc->mc_top < 1) {
if ((rc = mdbx_page_new(mc, P_BRANCH, 1, &pp))) MDBX_page *pp;
rc = mdbx_page_new(mc, P_BRANCH, 1, &pp);
if (unlikely(rc != MDBX_SUCCESS))
goto done; goto done;
/* shift current top to make room for new parent */ /* shift current top to make room for new parent */
mdbx_cassert(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); mdbx_cassert(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0);
#if MDBX_DEBUG
memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3);
memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3);
#endif
mc->mc_pg[2] = mc->mc_pg[1]; mc->mc_pg[2] = mc->mc_pg[1];
mc->mc_ki[2] = mc->mc_ki[1]; mc->mc_ki[2] = mc->mc_ki[1];
mc->mc_pg[1] = mc->mc_pg[0]; mc->mc_pg[1] = mc->mc_pg[0];
@ -17133,8 +17147,8 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
foliage = mc->mc_db->md_depth++; foliage = mc->mc_db->md_depth++;
/* Add left (implicit) pointer. */ /* Add left (implicit) pointer. */
if (unlikely((rc = mdbx_node_add_branch(mc, 0, NULL, mp->mp_pgno)) != rc = mdbx_node_add_branch(mc, 0, NULL, mp->mp_pgno);
MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
/* undo the pre-push */ /* undo the pre-push */
mc->mc_pg[0] = mc->mc_pg[1]; mc->mc_pg[0] = mc->mc_pg[1];
mc->mc_ki[0] = mc->mc_ki[1]; mc->mc_ki[0] = mc->mc_ki[1];
@ -17145,26 +17159,73 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
mc->mc_snum++; mc->mc_snum++;
mc->mc_top++; mc->mc_top++;
ptop = 0; ptop = 0;
if (mdbx_audit_enabled()) {
rc = mdbx_cursor_check(mc, C_UPDATING);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
} else { } else {
ptop = mc->mc_top - 1; ptop = mc->mc_top - 1;
mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno);
} }
MDBX_cursor mn;
cursor_copy_internal(mc, &mn); cursor_copy_internal(mc, &mn);
mn.mc_xcursor = NULL; mn.mc_xcursor = NULL;
mn.mc_pg[mn.mc_top] = rp; mn.mc_pg[mn.mc_top] = sister;
mn.mc_ki[mn.mc_top] = 0; mn.mc_ki[mn.mc_top] = 0;
mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1;
unsigned split_indx; unsigned split_indx =
if (nflags & MDBX_APPEND) { (newindx < nkeys)
mn.mc_ki[mn.mc_top] = 0; ? /* split at the middle */ (nkeys + 1) / 2
: /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1;
mdbx_cassert(mc, !IS_BRANCH(mp) || newindx > 0);
/* It is reasonable and possible to split the page at the begin */
if (unlikely(newindx < minkeys)) {
split_indx = minkeys;
if (newindx == 0 && foliage == 0 && !(nflags & MDBX_SPLIT_REPLACE)) {
split_indx = 0;
/* Checking for ability of splitting by the left-side insertion
* of a pure page with the new key */
for (i = 0; i < mc->mc_top; ++i)
if (mc->mc_ki[i]) {
get_key(page_node(mc->mc_pg[i], mc->mc_ki[i]), &sepkey);
if (mc->mc_dbx->md_cmp(newkey, &sepkey) >= 0)
split_indx = minkeys;
break;
}
if (split_indx == 0) {
/* Save the current first key which was omitted on the parent branch
* page and should be updated if the new first entry will be added */
if (IS_LEAF2(mp)) {
sepkey.iov_len = mp->mp_leaf2_ksize;
sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len);
} else
get_key(page_node(mp, 0), &sepkey);
mdbx_cassert(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0);
/* Avoiding rare complex cases of split the parent page */
if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey))
split_indx = minkeys;
}
}
}
const bool pure_right = split_indx == nkeys;
const bool pure_left = split_indx == 0;
if (unlikely(pure_right)) {
/* newindx == split_indx == nkeys */
mdbx_trace("no-split, but add new pure page at the %s", "right/after");
mdbx_cassert(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1);
sepkey = *newkey; sepkey = *newkey;
split_indx = newindx; } else if (unlikely(pure_left)) {
nkeys = 0; /* newindx == split_indx == 0 */
mdbx_trace("no-split, but add new pure page at the %s", "left/before");
mdbx_cassert(mc, newindx == 0 && split_indx == 0 && minkeys == 1);
mdbx_trace("old-first-key is %s", DKEY_DEBUG(&sepkey));
} else { } else {
split_indx = (nkeys + 1) / 2; if (IS_LEAF2(sister)) {
if (IS_LEAF2(rp)) {
char *split, *ins; char *split, *ins;
unsigned lsize, rsize, ksize; unsigned lsize, rsize, ksize;
/* Move half of the keys to the right sibling */ /* Move half of the keys to the right sibling */
@ -17175,23 +17236,19 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
lsize = (nkeys - split_indx) * sizeof(indx_t); lsize = (nkeys - split_indx) * sizeof(indx_t);
mdbx_cassert(mc, mp->mp_lower >= lsize); mdbx_cassert(mc, mp->mp_lower >= lsize);
mp->mp_lower -= (indx_t)lsize; mp->mp_lower -= (indx_t)lsize;
mdbx_cassert(mc, rp->mp_lower + lsize <= UINT16_MAX); mdbx_cassert(mc, sister->mp_lower + lsize <= UINT16_MAX);
rp->mp_lower += (indx_t)lsize; sister->mp_lower += (indx_t)lsize;
mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX);
mp->mp_upper += (indx_t)(rsize - lsize); mp->mp_upper += (indx_t)(rsize - lsize);
mdbx_cassert(mc, rp->mp_upper >= rsize - lsize); mdbx_cassert(mc, sister->mp_upper >= rsize - lsize);
rp->mp_upper -= (indx_t)(rsize - lsize); sister->mp_upper -= (indx_t)(rsize - lsize);
sepkey.iov_len = ksize; sepkey.iov_len = ksize;
if (newindx == split_indx) { sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base;
sepkey.iov_base = newkey->iov_base;
} else {
sepkey.iov_base = split;
}
if (x < 0) { if (x < 0) {
mdbx_cassert(mc, ksize >= sizeof(indx_t)); mdbx_cassert(mc, ksize >= sizeof(indx_t));
ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize);
memcpy(rp->mp_ptrs, split, rsize); memcpy(sister->mp_ptrs, split, rsize);
sepkey.iov_base = rp->mp_ptrs; sepkey.iov_base = sister->mp_ptrs;
memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
memcpy(ins, newkey->iov_base, ksize); memcpy(ins, newkey->iov_base, ksize);
mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t));
@ -17199,41 +17256,50 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t));
mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t));
} else { } else {
memcpy(rp->mp_ptrs, split, x * ksize); memcpy(sister->mp_ptrs, split, x * ksize);
ins = page_leaf2key(rp, x, ksize); ins = page_leaf2key(sister, x, ksize);
memcpy(ins, newkey->iov_base, ksize); memcpy(ins, newkey->iov_base, ksize);
memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); memcpy(ins + ksize, split + x * ksize, rsize - x * ksize);
mdbx_cassert(mc, UINT16_MAX - rp->mp_lower >= (int)sizeof(indx_t)); mdbx_cassert(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t));
rp->mp_lower += sizeof(indx_t); sister->mp_lower += sizeof(indx_t);
mdbx_cassert(mc, rp->mp_upper >= ksize - sizeof(indx_t)); mdbx_cassert(mc, sister->mp_upper >= ksize - sizeof(indx_t));
rp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t));
mdbx_cassert(mc, x <= (int)UINT16_MAX); mdbx_cassert(mc, x <= (int)UINT16_MAX);
mc->mc_ki[mc->mc_top] = (indx_t)x; mc->mc_ki[mc->mc_top] = (indx_t)x;
} }
if (mdbx_audit_enabled()) {
rc = mdbx_cursor_check(mc, C_UPDATING);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
rc = mdbx_cursor_check(&mn, C_UPDATING);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
} else { } else {
/* Maximum free space in an empty page */ /* Maximum free space in an empty page */
const unsigned pmax = page_space(env); const unsigned max_space = page_space(env);
const size_t nsize = IS_LEAF(mp) ? leaf_size(env, newkey, newdata) const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata)
: branch_size(env, newkey); : branch_size(env, newkey);
/* grab a page to hold a temporary copy */ /* grab a page to hold a temporary copy */
copy = mdbx_page_malloc(mc->mc_txn, 1); tmp_ki_copy = mdbx_page_malloc(mc->mc_txn, 1);
if (unlikely(copy == NULL)) { if (unlikely(tmp_ki_copy == NULL)) {
rc = MDBX_ENOMEM; rc = MDBX_ENOMEM;
goto done; goto done;
} }
copy->mp_pgno = mp->mp_pgno;
copy->mp_flags = mp->mp_flags;
copy->mp_txnid = INVALID_TXNID;
copy->mp_lower = 0;
copy->mp_upper = (indx_t)page_space(env);
/* prepare to insert */ /* prepare to insert */
for (unsigned j = i = 0; i < nkeys; i++) { for (unsigned j = i = 0; i < nkeys; ++i, ++j) {
if (i == newindx) tmp_ki_copy->mp_ptrs[j] = 0;
copy->mp_ptrs[j++] = 0; j += (i == newindx);
copy->mp_ptrs[j++] = mp->mp_ptrs[i]; tmp_ki_copy->mp_ptrs[j] = mp->mp_ptrs[i];
} }
tmp_ki_copy->mp_pgno = mp->mp_pgno;
tmp_ki_copy->mp_flags = mp->mp_flags;
tmp_ki_copy->mp_txnid = INVALID_TXNID;
tmp_ki_copy->mp_lower = 0;
tmp_ki_copy->mp_upper = (indx_t)max_space;
/* When items are relatively large the split point needs /* When items are relatively large the split point needs
* to be checked, because being off-by-one will make the * to be checked, because being off-by-one will make the
@ -17241,7 +17307,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
* *
* It's also relevant if a page happens to be laid out * It's also relevant if a page happens to be laid out
* such that one half of its nodes are all "small" and * such that one half of its nodes are all "small" and
* the other half of its nodes are "large." If the new * the other half of its nodes are "large". If the new
* item is also "large" and falls on the half with * item is also "large" and falls on the half with
* "large" nodes, it also may not fit. * "large" nodes, it also may not fit.
* *
@ -17249,74 +17315,86 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
* spot on the page (and thus, onto the new page), bias * spot on the page (and thus, onto the new page), bias
* the split so the new page is emptier than the old page. * the split so the new page is emptier than the old page.
* This yields better packing during sequential inserts. */ * This yields better packing during sequential inserts. */
if (nkeys < 32 || nsize > pmax / 16 || newindx >= nkeys) {
if (nkeys < 32 || new_size > max_space / 16) {
/* Find split point */ /* Find split point */
int dir; int dir;
size_t psize = 0; if (newindx <= split_indx) {
unsigned k;
if (newindx <= split_indx || newindx >= nkeys) {
i = 0; i = 0;
dir = 1; dir = 1;
k = (newindx >= nkeys) ? nkeys : split_indx + 1 + IS_LEAF(mp);
split_indx = k - 1;
} else { } else {
i = nkeys; i = nkeys;
dir = -1; dir = -1;
k = split_indx - 1;
split_indx += 1;
} }
size_t before = 0, after = new_size + page_used(env, mp);
int best = split_indx;
int best_offset = nkeys + 1;
mdbx_trace("seek separator from %u, step %i, default %u, new-idx %u, "
"new-size %zu",
i, dir, split_indx, newindx, new_size);
do { do {
if (i == newindx) { mdbx_cassert(mc, i <= nkeys);
psize += nsize; size_t size = new_size;
} else { if (i != newindx) {
MDBX_node *node = MDBX_node *node =
(MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ);
psize += NODESIZE + node_ks(node) + sizeof(indx_t); size = NODESIZE + node_ks(node) + sizeof(indx_t);
if (IS_LEAF(mp)) if (IS_LEAF(mp))
psize += F_ISSET(node_flags(node), F_BIGDATA) ? sizeof(pgno_t) size += F_ISSET(node_flags(node), F_BIGDATA) ? sizeof(pgno_t)
: node_ds(node); : node_ds(node);
psize = EVEN(psize); size = EVEN(size);
} }
if (psize > pmax) {
split_indx = i + (dir < 0); before += size;
break; after -= size;
mdbx_trace("step %u, size %zu, before %zu, after %zu, max %u", i,
size, before, after, max_space);
if (before <= max_space && after <= max_space) {
int offset = branchless_abs(split_indx - i);
if (offset >= best_offset)
break;
best_offset = offset;
best = i;
} }
i += dir; i += dir;
} while (i != k); } while (i < nkeys);
split_indx = best + (dir > 0);
split_indx = (split_indx <= nkeys - minkeys + 1) ? split_indx
: nkeys - minkeys + 1;
split_indx = (split_indx >= minkeys) ? split_indx : minkeys;
mdbx_trace("chosen %u", split_indx);
} }
if (split_indx == newindx) { sepkey.iov_len = newkey->iov_len;
sepkey.iov_len = newkey->iov_len; sepkey.iov_base = newkey->iov_base;
sepkey.iov_base = newkey->iov_base; if (split_indx != newindx) {
} else {
MDBX_node *node = MDBX_node *node =
(MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEHDRSZ); (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] +
PAGEHDRSZ);
sepkey.iov_len = node_ks(node); sepkey.iov_len = node_ks(node);
sepkey.iov_base = node_key(node); sepkey.iov_base = node_key(node);
} }
} }
} }
mdbx_debug("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey));
mdbx_debug("separator is %d [%s]", split_indx, DKEY(&sepkey)); bool did_split_parent = false;
if (mdbx_audit_enabled()) {
rc = mdbx_cursor_check(mc, C_UPDATING);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
rc = mdbx_cursor_check(&mn, C_UPDATING);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
/* Copy separator key to the parent. */ /* Copy separator key to the parent. */
if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) {
mdbx_trace("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey));
mdbx_cassert(mc, page_numkeys(mn.mc_pg[ptop]) > 2);
mdbx_cassert(mc, !pure_left);
const int snum = mc->mc_snum; const int snum = mc->mc_snum;
const int depth = mc->mc_db->md_depth; const int depth = mc->mc_db->md_depth;
mn.mc_snum--; mn.mc_snum--;
mn.mc_top--; mn.mc_top--;
did_split = 1; did_split_parent = true;
/* We want other splits to find mn when doing fixups */ /* We want other splits to find mn when doing fixups */
WITH_CURSOR_TRACKING( WITH_CURSOR_TRACKING(
mn, rc = mdbx_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); mn, rc = mdbx_page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0));
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto done; goto done;
mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth);
@ -17344,30 +17422,60 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
/* find right page's left sibling */ /* find right page's left sibling */
mc->mc_ki[ptop] = mn.mc_ki[ptop]; mc->mc_ki[ptop] = mn.mc_ki[ptop];
rc = mdbx_cursor_sibling(mc, SIBLING_LEFT); rc = mdbx_cursor_sibling(mc, SIBLING_LEFT);
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ {
mdbx_error("unexpected %s", "MDBX_NOTFOUND");
rc = MDBX_PROBLEM;
}
goto done;
}
} }
} }
} else if (unlikely(pure_left)) {
MDBX_page *ptop_page = mc->mc_pg[ptop];
mdbx_notice("adding to parent page %u node[%u] left-leaf page #%u key %s",
ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno,
DKEY(mc->mc_ki[ptop] ? newkey : NULL));
mc->mc_top--;
rc = mdbx_node_add_branch(mc, mc->mc_ki[ptop],
mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno);
mdbx_cassert(mc, mp == mc->mc_pg[ptop + 1] &&
newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top);
if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) {
mdbx_notice("update prev-first key on parent %s", DKEY(&sepkey));
MDBX_node *node = page_node(mc->mc_pg[ptop], 1);
mdbx_cassert(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno);
mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0);
mc->mc_ki[ptop] = 1;
rc = mdbx_update_key(mc, &sepkey);
mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1);
mdbx_cassert(mc,
mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]);
mc->mc_ki[ptop] = 0;
}
mc->mc_top++;
if (unlikely(rc != MDBX_SUCCESS))
goto done;
MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1);
mdbx_cassert(mc, node_pgno(node) == mp->mp_pgno &&
mc->mc_pg[ptop] == ptop_page);
} else { } else {
mn.mc_top--; mn.mc_top--;
rc = mdbx_node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, rp->mp_pgno); mdbx_trace("add-to-parent the right-entry[%u] for new sibling-page",
mn.mc_ki[ptop]);
rc = mdbx_node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno);
mn.mc_top++; mn.mc_top++;
} if (unlikely(rc != MDBX_SUCCESS))
if (unlikely(rc != MDBX_SUCCESS)) { goto done;
if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ {
mdbx_error("unexpected %s", "MDBX_NOTFOUND");
rc = MDBX_PROBLEM;
}
goto done;
} }
if (nflags & MDBX_APPEND) { if (unlikely(pure_left | pure_right)) {
mc->mc_pg[mc->mc_top] = rp; mc->mc_pg[mc->mc_top] = sister;
mc->mc_ki[mc->mc_top] = 0; mc->mc_ki[mc->mc_top] = 0;
switch (PAGETYPE(rp)) { switch (PAGETYPE(sister)) {
case P_BRANCH: {
mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0);
mdbx_cassert(mc, newpgno != 0 && newpgno != P_INVALID);
rc = mdbx_node_add_branch(mc, 0, newkey, newpgno);
} break;
case P_LEAF: { case P_LEAF: {
mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID);
rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags); rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags);
@ -17378,32 +17486,56 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
rc = mdbx_node_add_leaf2(mc, 0, newkey); rc = mdbx_node_add_leaf2(mc, 0, newkey);
} break; } break;
default: default:
rc = bad_page(rp, "wrong page-type %u\n", PAGETYPE(rp)); rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister));
} }
if (rc) if (unlikely(rc != MDBX_SUCCESS))
goto done; goto done;
for (i = 0; i < mc->mc_top; i++)
mc->mc_ki[i] = mn.mc_ki[i]; if (pure_right) {
for (i = 0; i < mc->mc_top; i++)
mc->mc_ki[i] = mn.mc_ki[i];
} else if (mc->mc_ki[mc->mc_top - 1] == 0) {
for (unsigned i = 2; i <= mc->mc_top; ++i)
if (mc->mc_ki[mc->mc_top - i]) {
get_key(
page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]),
&sepkey);
if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) {
mc->mc_top -= i;
mdbx_notice("update new-first on parent [%i] page %u key %s",
mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno,
DKEY(newkey));
rc = mdbx_update_key(mc, newkey);
mc->mc_top += i;
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
break;
}
}
} else if (!IS_LEAF2(mp)) { } else if (!IS_LEAF2(mp)) {
/* Move nodes */ /* Move nodes */
mc->mc_pg[mc->mc_top] = rp; mc->mc_pg[mc->mc_top] = sister;
i = split_indx; i = split_indx;
indx_t n = 0; unsigned n = 0;
pgno_t pgno = 0;
do { do {
mdbx_trace("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n,
sister->mp_pgno);
MDBX_val *rdata = NULL; MDBX_val *rdata = NULL;
if (i == newindx) { if (i == newindx) {
rkey.iov_base = newkey->iov_base; rkey.iov_base = newkey->iov_base;
rkey.iov_len = newkey->iov_len; rkey.iov_len = newkey->iov_len;
if (IS_LEAF(mp)) { if (IS_LEAF(mp))
rdata = newdata; rdata = newdata;
} else else
pgno = newpgno; pgno = newpgno;
flags = nflags; flags = nflags;
/* Update index for the new key. */ /* Update index for the new key. */
mc->mc_ki[mc->mc_top] = n; mc->mc_ki[mc->mc_top] = (indx_t)n;
} else { } else {
MDBX_node *node = MDBX_node *node =
(MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ);
rkey.iov_base = node_key(node); rkey.iov_base = node_key(node);
rkey.iov_len = node_ks(node); rkey.iov_len = node_ks(node);
if (IS_LEAF(mp)) { if (IS_LEAF(mp)) {
@ -17415,14 +17547,11 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
flags = node_flags(node); flags = node_flags(node);
} }
switch (PAGETYPE(rp)) { switch (PAGETYPE(sister)) {
case P_BRANCH: { case P_BRANCH: {
mdbx_cassert(mc, 0 == (uint16_t)flags); mdbx_cassert(mc, 0 == (uint16_t)flags);
if (n == 0) { /* First branch index doesn't need key data. */
/* First branch index doesn't need key data. */ rc = mdbx_node_add_branch(mc, n, n ? &rkey : NULL, pgno);
rkey.iov_len = 0;
}
rc = mdbx_node_add_branch(mc, n, &rkey, pgno);
} break; } break;
case P_LEAF: { case P_LEAF: {
mdbx_cassert(mc, pgno == 0); mdbx_cassert(mc, pgno == 0);
@ -17435,34 +17564,36 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
rc = mdbx_node_add_leaf2(mc, n, &rkey); rc = mdbx_node_add_leaf2(mc, n, &rkey);
} break; */ } break; */
default: default:
rc = bad_page(rp, "wrong page-type %u\n", PAGETYPE(rp)); rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister));
} }
if (rc) if (unlikely(rc != MDBX_SUCCESS))
goto done; goto done;
if (i == nkeys) { ++n;
if (++i > nkeys) {
i = 0; i = 0;
n = 0; n = 0;
mc->mc_pg[mc->mc_top] = copy; mc->mc_pg[mc->mc_top] = tmp_ki_copy;
} else { mdbx_trace("switch to mp #%u", tmp_ki_copy->mp_pgno);
i++;
n++;
} }
} while (i != split_indx); } while (i != split_indx);
nkeys = page_numkeys(copy); mdbx_trace("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n,
mc->mc_pg[mc->mc_top]->mp_pgno);
nkeys = page_numkeys(tmp_ki_copy);
for (i = 0; i < nkeys; i++) for (i = 0; i < nkeys; i++)
mp->mp_ptrs[i] = copy->mp_ptrs[i]; mp->mp_ptrs[i] = tmp_ki_copy->mp_ptrs[i];
mp->mp_lower = copy->mp_lower; mp->mp_lower = tmp_ki_copy->mp_lower;
mp->mp_upper = copy->mp_upper; mp->mp_upper = tmp_ki_copy->mp_upper;
memcpy(page_node(mp, nkeys - 1), page_node(copy, nkeys - 1), memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1),
env->me_psize - copy->mp_upper - PAGEHDRSZ); env->me_psize - tmp_ki_copy->mp_upper - PAGEHDRSZ);
/* reset back to original page */ /* reset back to original page */
if (newindx < split_indx) { if (newindx < split_indx) {
mc->mc_pg[mc->mc_top] = mp; mc->mc_pg[mc->mc_top] = mp;
} else { } else {
mc->mc_pg[mc->mc_top] = rp; mc->mc_pg[mc->mc_top] = sister;
mc->mc_ki[ptop]++; mc->mc_ki[ptop]++;
/* Make sure mc_ki is still valid. */ /* Make sure mc_ki is still valid. */
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
@ -17473,81 +17604,83 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
} }
} }
} }
if (nflags & MDBX_RESERVE) { } else if (newindx >= split_indx) {
mc->mc_pg[mc->mc_top] = sister;
mc->mc_ki[ptop]++;
/* Make sure mc_ki is still valid. */
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
for (i = 0; i <= ptop; i++) {
mc->mc_pg[i] = mn.mc_pg[i];
mc->mc_ki[i] = mn.mc_ki[i];
}
}
}
/* Adjust other cursors pointing to mp and/or to parent page */
nkeys = page_numkeys(mp);
for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2;
m2 = m2->mc_next) {
MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
if (m3 == mc)
continue;
if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
continue;
if (foliage) {
/* sub cursors may be on different DB */
if (m3->mc_pg[0] != mp)
continue;
/* root split */
for (int k = foliage; k >= 0; k--) {
m3->mc_ki[k + 1] = m3->mc_ki[k];
m3->mc_pg[k + 1] = m3->mc_pg[k];
}
m3->mc_ki[0] = (m3->mc_ki[0] >= nkeys) ? 1 : 0;
m3->mc_pg[0] = mc->mc_pg[0];
m3->mc_snum++;
m3->mc_top++;
}
if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) {
if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE))
m3->mc_ki[mc->mc_top]++;
if (m3->mc_ki[mc->mc_top] >= nkeys) {
m3->mc_pg[mc->mc_top] = sister;
mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys);
m3->mc_ki[mc->mc_top] -= (indx_t)nkeys;
for (i = 0; i < mc->mc_top; i++) {
m3->mc_ki[i] = mn.mc_ki[i];
m3->mc_pg[i] = mn.mc_pg[i];
}
}
} else if (!did_split_parent && m3->mc_top >= ptop &&
m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
m3->mc_ki[ptop]++; /* also for the `pure-left` case */
}
if (XCURSOR_INITED(m3) && IS_LEAF(mp))
XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
}
mdbx_trace("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp),
sister->mp_pgno, page_room(sister));
done:
if (tmp_ki_copy)
mdbx_dpage_free(env, tmp_ki_copy, 1);
if (unlikely(rc != MDBX_SUCCESS))
mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
else {
if (mdbx_audit_enabled())
rc = mdbx_cursor_check(mc, C_UPDATING);
if (unlikely(nflags & MDBX_RESERVE)) {
MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
if (!(node_flags(node) & F_BIGDATA)) if (!(node_flags(node) & F_BIGDATA))
newdata->iov_base = node_data(node); newdata->iov_base = node_data(node);
} }
} else {
if (newindx >= split_indx) {
mc->mc_pg[mc->mc_top] = rp;
mc->mc_ki[ptop]++;
/* Make sure mc_ki is still valid. */
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
for (i = 0; i <= ptop; i++) {
mc->mc_pg[i] = mn.mc_pg[i];
mc->mc_ki[i] = mn.mc_ki[i];
}
}
}
} }
{ mdbx_debug("<< mp #%u, rc %d", mp->mp_pgno, rc);
/* Adjust other cursors pointing to mp */
MDBX_cursor *m2, *m3;
MDBX_dbi dbi = mc->mc_dbi;
nkeys = page_numkeys(mp);
for (m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) {
m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
if (m3 == mc)
continue;
if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
continue;
if (foliage) {
int k;
/* sub cursors may be on different DB */
if (m3->mc_pg[0] != mp)
continue;
/* root split */
for (k = foliage; k >= 0; k--) {
m3->mc_ki[k + 1] = m3->mc_ki[k];
m3->mc_pg[k + 1] = m3->mc_pg[k];
}
m3->mc_ki[0] = (m3->mc_ki[0] >= nkeys) ? 1 : 0;
m3->mc_pg[0] = mc->mc_pg[0];
m3->mc_snum++;
m3->mc_top++;
}
if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE))
m3->mc_ki[mc->mc_top]++;
if (m3->mc_ki[mc->mc_top] >= nkeys) {
m3->mc_pg[mc->mc_top] = rp;
mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys);
m3->mc_ki[mc->mc_top] -= (indx_t)nkeys;
for (i = 0; i < mc->mc_top; i++) {
m3->mc_ki[i] = mn.mc_ki[i];
m3->mc_pg[i] = mn.mc_pg[i];
}
}
} else if (!did_split && m3->mc_top >= ptop &&
m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
m3->mc_ki[ptop]++;
}
if (XCURSOR_INITED(m3) && IS_LEAF(mp))
XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
}
}
mdbx_debug("mp left: %d, rp left: %d", page_room(mp), page_room(rp));
done:
if (copy) /* tmp page */
mdbx_dpage_free(env, copy, 1);
if (unlikely(rc))
mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
return rc; return rc;
} }