libmdbx/src/tree.c

1646 lines
59 KiB
C
Raw Normal View History

/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
static MDBX_cursor *cursor_clone(const MDBX_cursor *csrc,
cursor_couple_t *couple) {
cASSERT(csrc, csrc->txn->txnid >= csrc->txn->env->lck->cached_oldest.weak);
couple->outer.next = nullptr;
couple->outer.backup = nullptr;
couple->outer.subcur = nullptr;
couple->outer.clc = nullptr;
couple->outer.txn = csrc->txn;
couple->outer.dbi_state = csrc->dbi_state;
couple->outer.checking = z_pagecheck;
couple->outer.tree = nullptr;
couple->outer.top_and_flags = 0;
MDBX_cursor *cdst = &couple->outer;
if (is_inner(csrc)) {
couple->inner.cursor.next = nullptr;
couple->inner.cursor.backup = nullptr;
couple->inner.cursor.subcur = nullptr;
couple->inner.cursor.txn = csrc->txn;
couple->inner.cursor.dbi_state = csrc->dbi_state;
couple->outer.subcur = &couple->inner;
cdst = &couple->inner.cursor;
}
cdst->checking = csrc->checking;
cdst->tree = csrc->tree;
cdst->clc = csrc->clc;
cursor_cpstk(csrc, cdst);
return cdst;
}
/*----------------------------------------------------------------------------*/
void recalculate_merge_thresholds(MDBX_env *env) {
const size_t bytes = page_space(env);
env->merge_threshold =
(uint16_t)(bytes -
(bytes * env->options.merge_threshold_16dot16_percent >> 16));
env->merge_threshold_gc =
(uint16_t)(bytes - ((env->options.merge_threshold_16dot16_percent > 19005)
? bytes / 3 /* 33 % */
: bytes / 4 /* 25 % */));
}
int tree_drop(MDBX_cursor *mc, const bool may_have_subDBs) {
MDBX_txn *txn = mc->txn;
int rc = tree_search(mc, nullptr, Z_FIRST);
if (likely(rc == MDBX_SUCCESS)) {
/* DUPSORT sub-DBs have no large-pages/subDBs. Omit scanning leaves.
* This also avoids any P_DUPFIX pages, which have no nodes.
* Also if the DB doesn't have sub-DBs and has no large/overflow
* pages, omit scanning leaves. */
if (!(may_have_subDBs | mc->tree->large_pages))
cursor_pop(mc);
rc = pnl_need(&txn->tw.retired_pages, (size_t)mc->tree->branch_pages +
(size_t)mc->tree->leaf_pages +
(size_t)mc->tree->large_pages);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
page_t *stack[CURSOR_STACK_SIZE];
for (intptr_t i = 0; i <= mc->top; ++i)
stack[i] = mc->pg[i];
while (mc->top >= 0) {
page_t *const mp = mc->pg[mc->top];
const size_t nkeys = page_numkeys(mp);
if (is_leaf(mp)) {
cASSERT(mc, mc->top + 1 == mc->tree->height);
for (size_t i = 0; i < nkeys; i++) {
node_t *node = page_node(mp, i);
if (node_flags(node) & N_BIGDATA) {
rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
if (!(may_have_subDBs | mc->tree->large_pages))
goto pop;
} else if (node_flags(node) & N_SUBDATA) {
if (unlikely((node_flags(node) & N_DUPDATA) == 0)) {
rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE;
goto bailout;
}
rc = cursor_dupsort_setup(mc, node, mp);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
rc = tree_drop(&mc->subcur->cursor, false);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
}
} else {
cASSERT(mc, mc->top + 1 < mc->tree->height);
mc->checking |= z_retiring;
const unsigned pagetype = (is_frozen(txn, mp) ? P_FROZEN : 0) +
((mc->top + 2 == mc->tree->height)
? (mc->checking & (P_LEAF | P_DUPFIX))
: P_BRANCH);
for (size_t i = 0; i < nkeys; i++) {
node_t *node = page_node(mp, i);
tASSERT(txn, (node_flags(node) &
(N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0);
const pgno_t pgno = node_pgno(node);
rc = page_retire_ex(mc, pgno, nullptr, pagetype);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
mc->checking -= z_retiring;
}
if (!mc->top)
break;
cASSERT(mc, nkeys > 0);
mc->ki[mc->top] = (indx_t)nkeys;
rc = cursor_sibling_right(mc);
if (unlikely(rc != MDBX_SUCCESS)) {
if (unlikely(rc != MDBX_NOTFOUND))
goto bailout;
/* no more siblings, go back to beginning
* of previous level. */
pop:
cursor_pop(mc);
mc->ki[0] = 0;
for (intptr_t i = 1; i <= mc->top; i++) {
mc->pg[i] = stack[i];
mc->ki[i] = 0;
}
}
}
rc = page_retire(mc, mc->pg[0]);
}
bailout:
be_poor(mc);
if (unlikely(rc != MDBX_SUCCESS))
txn->flags |= MDBX_TXN_ERROR;
return rc;
}
static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) {
int rc;
DKBUF_DEBUG;
page_t *psrc = csrc->pg[csrc->top];
page_t *pdst = cdst->pg[cdst->top];
cASSERT(csrc, page_type(psrc) == page_type(pdst));
cASSERT(csrc, csrc->tree == cdst->tree);
cASSERT(csrc, csrc->top == cdst->top);
if (unlikely(page_type(psrc) != page_type(pdst))) {
bailout:
ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node",
page_type(psrc), page_type(pdst));
csrc->txn->flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM;
}
MDBX_val key4move;
switch (page_type(psrc)) {
case P_BRANCH: {
const node_t *srcnode = page_node(psrc, csrc->ki[csrc->top]);
cASSERT(csrc, node_flags(srcnode) == 0);
const pgno_t srcpg = node_pgno(srcnode);
key4move.iov_len = node_ks(srcnode);
key4move.iov_base = node_key(srcnode);
if (csrc->ki[csrc->top] == 0) {
const int8_t top = csrc->top;
cASSERT(csrc, top >= 0);
/* must find the lowest key below src */
rc = tree_search_lowest(csrc);
page_t *lowest_page = csrc->pg[csrc->top];
if (unlikely(rc != MDBX_SUCCESS))
return rc;
cASSERT(csrc, is_leaf(lowest_page));
if (unlikely(!is_leaf(lowest_page)))
goto bailout;
if (is_dupfix_leaf(lowest_page))
key4move = page_dupfix_key(lowest_page, 0, csrc->tree->dupfix_size);
else {
const node_t *lowest_node = page_node(lowest_page, 0);
key4move.iov_len = node_ks(lowest_node);
key4move.iov_base = node_key(lowest_node);
}
/* restore cursor after mdbx_page_search_lowest() */
csrc->top = top;
csrc->ki[csrc->top] = 0;
/* paranoia */
cASSERT(csrc, psrc == csrc->pg[csrc->top]);
cASSERT(csrc, is_branch(psrc));
if (unlikely(!is_branch(psrc)))
goto bailout;
}
if (cdst->ki[cdst->top] == 0) {
cursor_couple_t couple;
MDBX_cursor *const mn = cursor_clone(cdst, &couple);
const int8_t top = cdst->top;
cASSERT(csrc, top >= 0);
/* must find the lowest key below dst */
rc = tree_search_lowest(mn);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
page_t *const lowest_page = mn->pg[mn->top];
cASSERT(cdst, is_leaf(lowest_page));
if (unlikely(!is_leaf(lowest_page)))
goto bailout;
MDBX_val key;
if (is_dupfix_leaf(lowest_page))
key = page_dupfix_key(lowest_page, 0, mn->tree->dupfix_size);
else {
node_t *lowest_node = page_node(lowest_page, 0);
key.iov_len = node_ks(lowest_node);
key.iov_base = node_key(lowest_node);
}
/* restore cursor after mdbx_page_search_lowest() */
mn->top = top;
mn->ki[mn->top] = 0;
const intptr_t delta = EVEN_CEIL(key.iov_len) -
EVEN_CEIL(node_ks(page_node(mn->pg[mn->top], 0)));
const intptr_t needed = branch_size(cdst->txn->env, &key4move) + delta;
const intptr_t have = page_room(pdst);
if (unlikely(needed > have))
return MDBX_RESULT_TRUE;
if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
return rc;
psrc = csrc->pg[csrc->top];
pdst = cdst->pg[cdst->top];
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = tree_propagate_key(mn, &key);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
} else {
const size_t needed = branch_size(cdst->txn->env, &key4move);
const size_t have = page_room(pdst);
if (unlikely(needed > have))
return MDBX_RESULT_TRUE;
if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
return rc;
psrc = csrc->pg[csrc->top];
pdst = cdst->pg[cdst->top];
}
DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO
" to node %u on page %" PRIaPGNO,
"branch", csrc->ki[csrc->top], DKEY_DEBUG(&key4move), psrc->pgno,
cdst->ki[cdst->top], pdst->pgno);
/* Add the node to the destination page. */
rc = node_add_branch(cdst, cdst->ki[cdst->top], &key4move, srcpg);
} break;
case P_LEAF: {
/* Mark src and dst as dirty. */
if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
return rc;
psrc = csrc->pg[csrc->top];
pdst = cdst->pg[cdst->top];
const node_t *srcnode = page_node(psrc, csrc->ki[csrc->top]);
MDBX_val data;
data.iov_len = node_ds(srcnode);
data.iov_base = node_data(srcnode);
key4move.iov_len = node_ks(srcnode);
key4move.iov_base = node_key(srcnode);
DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO
" to node %u on page %" PRIaPGNO,
"leaf", csrc->ki[csrc->top], DKEY_DEBUG(&key4move), psrc->pgno,
cdst->ki[cdst->top], pdst->pgno);
/* Add the node to the destination page. */
rc = node_add_leaf(cdst, cdst->ki[cdst->top], &key4move, &data,
node_flags(srcnode));
} break;
case P_LEAF | P_DUPFIX: {
/* Mark src and dst as dirty. */
if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
return rc;
psrc = csrc->pg[csrc->top];
pdst = cdst->pg[cdst->top];
key4move =
page_dupfix_key(psrc, csrc->ki[csrc->top], csrc->tree->dupfix_size);
DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO
" to node %u on page %" PRIaPGNO,
"leaf2", csrc->ki[csrc->top], DKEY_DEBUG(&key4move), psrc->pgno,
cdst->ki[cdst->top], pdst->pgno);
/* Add the node to the destination page. */
rc = node_add_dupfix(cdst, cdst->ki[cdst->top], &key4move);
} break;
default:
assert(false);
goto bailout;
}
if (unlikely(rc != MDBX_SUCCESS))
return rc;
/* Delete the node from the source page. */
node_del(csrc, key4move.iov_len);
cASSERT(csrc, psrc == csrc->pg[csrc->top]);
cASSERT(cdst, pdst == cdst->pg[cdst->top]);
cASSERT(csrc, page_type(psrc) == page_type(pdst));
/* csrc курсор тут всегда временный, на стеке внутри tree_rebalance(),
* и его нет необходимости корректировать. */
{
/* Adjust other cursors pointing to mp */
MDBX_cursor *m2, *m3;
const size_t dbi = cursor_dbi(csrc);
cASSERT(csrc, csrc->top == cdst->top);
if (fromleft) {
/* Перемещаем с левой страницы нв правую, нужно сдвинуть ki на +1 */
for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) {
m3 = (csrc->flags & z_inner) ? &m2->subcur->cursor : m2;
if (!is_related(csrc, m3))
continue;
if (m3 != cdst && m3->pg[csrc->top] == pdst &&
m3->ki[csrc->top] >= cdst->ki[csrc->top]) {
m3->ki[csrc->top] += 1;
}
if (/* m3 != csrc && */ m3->pg[csrc->top] == psrc &&
m3->ki[csrc->top] == csrc->ki[csrc->top]) {
m3->pg[csrc->top] = pdst;
m3->ki[csrc->top] = cdst->ki[cdst->top];
cASSERT(csrc, csrc->top > 0);
m3->ki[csrc->top - 1] += 1;
}
if (is_leaf(psrc) && inner_pointed(m3)) {
cASSERT(csrc, csrc->top == m3->top);
size_t nkeys = page_numkeys(m3->pg[csrc->top]);
if (likely(nkeys > m3->ki[csrc->top]))
cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]);
}
}
} else {
/* Перемещаем с правой страницы на левую, нужно сдвинуть ki на -1 */
for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) {
m3 = (csrc->flags & z_inner) ? &m2->subcur->cursor : m2;
if (!is_related(csrc, m3))
continue;
if (m3->pg[csrc->top] == psrc) {
if (!m3->ki[csrc->top]) {
m3->pg[csrc->top] = pdst;
m3->ki[csrc->top] = cdst->ki[cdst->top];
cASSERT(csrc, csrc->top > 0 && m3->ki[csrc->top - 1] > 0);
m3->ki[csrc->top - 1] -= 1;
} else
m3->ki[csrc->top] -= 1;
if (is_leaf(psrc) && inner_pointed(m3)) {
cASSERT(csrc, csrc->top == m3->top);
size_t nkeys = page_numkeys(m3->pg[csrc->top]);
if (likely(nkeys > m3->ki[csrc->top]))
cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]);
}
}
}
}
}
/* Update the parent separators. */
if (csrc->ki[csrc->top] == 0) {
cASSERT(csrc, csrc->top > 0);
if (csrc->ki[csrc->top - 1] != 0) {
MDBX_val key;
if (is_dupfix_leaf(psrc))
key = page_dupfix_key(psrc, 0, csrc->tree->dupfix_size);
else {
node_t *srcnode = page_node(psrc, 0);
key.iov_len = node_ks(srcnode);
key.iov_base = node_key(srcnode);
}
DEBUG("update separator for source page %" PRIaPGNO " to [%s]",
psrc->pgno, DKEY_DEBUG(&key));
cursor_couple_t couple;
MDBX_cursor *const mn = cursor_clone(csrc, &couple);
cASSERT(csrc, mn->top > 0);
mn->top -= 1;
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = tree_propagate_key(mn, &key);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
if (is_branch(psrc)) {
const MDBX_val nullkey = {0, 0};
const indx_t ix = csrc->ki[csrc->top];
csrc->ki[csrc->top] = 0;
rc = tree_propagate_key(csrc, &nullkey);
csrc->ki[csrc->top] = ix;
cASSERT(csrc, rc == MDBX_SUCCESS);
}
}
if (cdst->ki[cdst->top] == 0) {
cASSERT(cdst, cdst->top > 0);
if (cdst->ki[cdst->top - 1] != 0) {
MDBX_val key;
if (is_dupfix_leaf(pdst))
key = page_dupfix_key(pdst, 0, cdst->tree->dupfix_size);
else {
node_t *srcnode = page_node(pdst, 0);
key.iov_len = node_ks(srcnode);
key.iov_base = node_key(srcnode);
}
DEBUG("update separator for destination page %" PRIaPGNO " to [%s]",
pdst->pgno, DKEY_DEBUG(&key));
cursor_couple_t couple;
MDBX_cursor *const mn = cursor_clone(cdst, &couple);
cASSERT(cdst, mn->top > 0);
mn->top -= 1;
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = tree_propagate_key(mn, &key);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
if (is_branch(pdst)) {
const MDBX_val nullkey = {0, 0};
const indx_t ix = cdst->ki[cdst->top];
cdst->ki[cdst->top] = 0;
rc = tree_propagate_key(cdst, &nullkey);
cdst->ki[cdst->top] = ix;
cASSERT(cdst, rc == MDBX_SUCCESS);
}
}
return MDBX_SUCCESS;
}
static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
MDBX_val key;
int rc;
cASSERT(csrc, csrc != cdst);
cASSERT(csrc, cursor_is_tracked(csrc));
cASSERT(cdst, cursor_is_tracked(cdst));
const page_t *const psrc = csrc->pg[csrc->top];
page_t *pdst = cdst->pg[cdst->top];
DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->pgno, pdst->pgno);
cASSERT(csrc, page_type(psrc) == page_type(pdst));
cASSERT(csrc, csrc->clc == cdst->clc && csrc->tree == cdst->tree);
cASSERT(csrc, csrc->top > 0); /* can't merge root page */
cASSERT(cdst, cdst->top > 0);
cASSERT(cdst, cdst->top + 1 < cdst->tree->height ||
is_leaf(cdst->pg[cdst->tree->height - 1]));
cASSERT(csrc, csrc->top + 1 < csrc->tree->height ||
is_leaf(csrc->pg[csrc->tree->height - 1]));
cASSERT(cdst, csrc->txn->env->options.prefer_waf_insteadof_balance ||
page_room(pdst) >= page_used(cdst->txn->env, psrc));
const int pagetype = page_type(psrc);
/* Move all nodes from src to dst */
const size_t dst_nkeys = page_numkeys(pdst);
const size_t src_nkeys = page_numkeys(psrc);
cASSERT(cdst, dst_nkeys + src_nkeys >= (is_leaf(psrc) ? 1u : 2u));
if (likely(src_nkeys)) {
size_t ii = dst_nkeys;
if (unlikely(pagetype & P_DUPFIX)) {
/* Mark dst as dirty. */
rc = page_touch(cdst);
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
key.iov_len = csrc->tree->dupfix_size;
key.iov_base = page_data(psrc);
size_t i = 0;
do {
rc = node_add_dupfix(cdst, ii++, &key);
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
key.iov_base = ptr_disp(key.iov_base, key.iov_len);
} while (++i != src_nkeys);
} else {
node_t *srcnode = page_node(psrc, 0);
key.iov_len = node_ks(srcnode);
key.iov_base = node_key(srcnode);
if (pagetype & P_BRANCH) {
cursor_couple_t couple;
MDBX_cursor *const mn = cursor_clone(csrc, &couple);
/* must find the lowest key below src */
rc = tree_search_lowest(mn);
cASSERT(csrc, rc != MDBX_RESULT_TRUE);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
const page_t *mp = mn->pg[mn->top];
if (likely(!is_dupfix_leaf(mp))) {
cASSERT(mn, is_leaf(mp));
const node_t *lowest = page_node(mp, 0);
key.iov_len = node_ks(lowest);
key.iov_base = node_key(lowest);
} else {
cASSERT(mn, mn->top > csrc->top);
key = page_dupfix_key(mp, mn->ki[mn->top], csrc->tree->dupfix_size);
}
cASSERT(mn, key.iov_len >= csrc->clc->k.lmin);
cASSERT(mn, key.iov_len <= csrc->clc->k.lmax);
const size_t dst_room = page_room(pdst);
const size_t src_used = page_used(cdst->txn->env, psrc);
const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len;
if (unlikely(space_needed > dst_room))
return MDBX_RESULT_TRUE;
}
/* Mark dst as dirty. */
rc = page_touch(cdst);
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
size_t i = 0;
while (true) {
if (pagetype & P_LEAF) {
MDBX_val data;
data.iov_len = node_ds(srcnode);
data.iov_base = node_data(srcnode);
rc = node_add_leaf(cdst, ii++, &key, &data, node_flags(srcnode));
} else {
cASSERT(csrc, node_flags(srcnode) == 0);
rc = node_add_branch(cdst, ii++, &key, node_pgno(srcnode));
}
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (++i == src_nkeys)
break;
srcnode = page_node(psrc, i);
key.iov_len = node_ks(srcnode);
key.iov_base = node_key(srcnode);
}
}
pdst = cdst->pg[cdst->top];
DEBUG("dst page %" PRIaPGNO " now has %zu keys (%u.%u%% filled)",
pdst->pgno, page_numkeys(pdst),
page_fill_percentum_x10(cdst->txn->env, pdst) / 10,
page_fill_percentum_x10(cdst->txn->env, pdst) % 10);
cASSERT(csrc, psrc == csrc->pg[csrc->top]);
cASSERT(cdst, pdst == cdst->pg[cdst->top]);
}
/* Unlink the src page from parent and add to free list. */
csrc->top -= 1;
node_del(csrc, 0);
if (csrc->ki[csrc->top] == 0) {
const MDBX_val nullkey = {0, 0};
rc = tree_propagate_key(csrc, &nullkey);
cASSERT(csrc, rc != MDBX_RESULT_TRUE);
if (unlikely(rc != MDBX_SUCCESS)) {
csrc->top += 1;
return rc;
}
}
csrc->top += 1;
cASSERT(csrc, psrc == csrc->pg[csrc->top]);
cASSERT(cdst, pdst == cdst->pg[cdst->top]);
{
/* Adjust other cursors pointing to mp */
MDBX_cursor *m2, *m3;
const size_t dbi = cursor_dbi(csrc);
for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) {
m3 = (csrc->flags & z_inner) ? &m2->subcur->cursor : m2;
if (!is_related(csrc, m3))
continue;
if (m3->pg[csrc->top] == psrc) {
m3->pg[csrc->top] = pdst;
m3->ki[csrc->top] += (indx_t)dst_nkeys;
m3->ki[csrc->top - 1] = cdst->ki[csrc->top - 1];
} else if (m3->pg[csrc->top - 1] == csrc->pg[csrc->top - 1] &&
m3->ki[csrc->top - 1] > csrc->ki[csrc->top - 1]) {
cASSERT(m3, m3->ki[csrc->top - 1] > 0 &&
m3->ki[csrc->top - 1] <=
page_numkeys(m3->pg[csrc->top - 1]));
m3->ki[csrc->top - 1] -= 1;
}
if (is_leaf(psrc) && inner_pointed(m3)) {
cASSERT(csrc, csrc->top == m3->top);
size_t nkeys = page_numkeys(m3->pg[csrc->top]);
if (likely(nkeys > m3->ki[csrc->top]))
cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]);
}
}
}
rc = page_retire(csrc, (page_t *)psrc);
cASSERT(csrc, rc != MDBX_RESULT_TRUE);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
cASSERT(cdst, cdst->tree->items > 0);
cASSERT(cdst, cdst->top + 1 <= cdst->tree->height);
cASSERT(cdst, cdst->top > 0);
page_t *const top_page = cdst->pg[cdst->top];
const indx_t top_indx = cdst->ki[cdst->top];
const int save_top = cdst->top;
const uint16_t save_height = cdst->tree->height;
cursor_pop(cdst);
rc = tree_rebalance(cdst);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
cASSERT(cdst, cdst->tree->items > 0);
cASSERT(cdst, cdst->top + 1 <= cdst->tree->height);
#if MDBX_ENABLE_PGOP_STAT
cdst->txn->env->lck->pgops.merge.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
if (is_leaf(cdst->pg[cdst->top])) {
/* LY: don't touch cursor if top-page is a LEAF */
cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) ||
page_type(cdst->pg[cdst->top]) == pagetype);
return MDBX_SUCCESS;
}
cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys);
if (unlikely(pagetype != page_type(top_page))) {
/* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */
goto bailout;
}
if (top_page == cdst->pg[cdst->top]) {
/* LY: don't touch cursor if prev top-page already on the top */
cASSERT(cdst, cdst->ki[cdst->top] == top_indx);
cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) ||
page_type(cdst->pg[cdst->top]) == pagetype);
return MDBX_SUCCESS;
}
const int new_top = save_top - save_height + cdst->tree->height;
if (unlikely(new_top < 0 || new_top >= cdst->tree->height)) {
/* LY: out of range, unable restore cursor's stack */
goto bailout;
}
if (top_page == cdst->pg[new_top]) {
cASSERT(cdst, cdst->ki[new_top] == top_indx);
/* LY: restore cursor stack */
cdst->top = (int8_t)new_top;
cASSERT(cdst, cdst->top + 1 < cdst->tree->height ||
is_leaf(cdst->pg[cdst->tree->height - 1]));
cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) ||
page_type(cdst->pg[cdst->top]) == pagetype);
return MDBX_SUCCESS;
}
page_t *const stub_page = (page_t *)(~(uintptr_t)top_page);
const indx_t stub_indx = top_indx;
if (save_height > cdst->tree->height &&
((cdst->pg[save_top] == top_page && cdst->ki[save_top] == top_indx) ||
(cdst->pg[save_top] == stub_page && cdst->ki[save_top] == stub_indx))) {
/* LY: restore cursor stack */
cdst->pg[new_top] = top_page;
cdst->ki[new_top] = top_indx;
#if MDBX_DEBUG
cdst->pg[new_top + 1] = nullptr;
cdst->ki[new_top + 1] = INT16_MAX;
#endif
cdst->top = (int8_t)new_top;
cASSERT(cdst, cdst->top + 1 < cdst->tree->height ||
is_leaf(cdst->pg[cdst->tree->height - 1]));
cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) ||
page_type(cdst->pg[cdst->top]) == pagetype);
return MDBX_SUCCESS;
}
bailout:
/* LY: unable restore cursor's stack */
be_poor(cdst);
return MDBX_CURSOR_FULL;
}
int tree_rebalance(MDBX_cursor *mc) {
cASSERT(mc, cursor_is_tracked(mc));
cASSERT(mc, mc->top >= 0);
cASSERT(mc, mc->top + 1 < mc->tree->height ||
is_leaf(mc->pg[mc->tree->height - 1]));
const page_t *const tp = mc->pg[mc->top];
const uint8_t pagetype = page_type(tp);
STATIC_ASSERT(P_BRANCH == 1);
const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1;
/* Pages emptier than this are candidates for merging. */
size_t room_threshold = likely(mc->tree != &mc->txn->dbs[FREE_DBI])
? mc->txn->env->merge_threshold
: mc->txn->env->merge_threshold_gc;
const size_t numkeys = page_numkeys(tp);
const size_t room = page_room(tp);
DEBUG("rebalancing %s page %" PRIaPGNO
" (has %zu keys, fill %u.%u%%, used %zu, room %zu bytes)",
is_leaf(tp) ? "leaf" : "branch", tp->pgno, numkeys,
page_fill_percentum_x10(mc->txn->env, tp) / 10,
page_fill_percentum_x10(mc->txn->env, tp) % 10,
page_used(mc->txn->env, tp), room);
cASSERT(mc, is_modifable(mc->txn, tp));
if (unlikely(numkeys < minkeys)) {
DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold",
tp->pgno, minkeys);
} else if (unlikely(room > room_threshold)) {
DEBUG("page %" PRIaPGNO " should be merged due room %zu > %zu threshold",
tp->pgno, room, room_threshold);
} else {
DEBUG("no need to rebalance page %" PRIaPGNO ", room %zu < %zu threshold",
tp->pgno, room, room_threshold);
cASSERT(mc, mc->tree->items > 0);
return MDBX_SUCCESS;
}
int rc;
if (mc->top == 0) {
page_t *const mp = mc->pg[0];
const size_t nkeys = page_numkeys(mp);
cASSERT(mc, (mc->tree->items == 0) == (nkeys == 0));
if (nkeys == 0) {
DEBUG("%s", "tree is completely empty");
cASSERT(mc, is_leaf(mp));
cASSERT(mc, (*cursor_dbi_state(mc) & DBI_DIRTY) != 0);
cASSERT(mc, mc->tree->branch_pages == 0 && mc->tree->large_pages == 0 &&
mc->tree->leaf_pages == 1);
/* Adjust cursors pointing to mp */
for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2;
m2 = m2->next) {
MDBX_cursor *m3 = (mc->flags & z_inner) ? &m2->subcur->cursor : m2;
if (!is_poor(m3) && m3->pg[0] == mp) {
be_poor(m3);
m3->flags |= z_after_delete;
}
}
if (is_subpage(mp)) {
return MDBX_SUCCESS;
} else {
mc->tree->root = P_INVALID;
mc->tree->height = 0;
return page_retire(mc, mp);
}
}
if (is_subpage(mp)) {
DEBUG("%s", "Can't rebalance a subpage, ignoring");
cASSERT(mc, is_leaf(tp));
return MDBX_SUCCESS;
}
if (is_branch(mp) && nkeys == 1) {
DEBUG("%s", "collapsing root page!");
mc->tree->root = node_pgno(page_node(mp, 0));
rc = page_get(mc, mc->tree->root, &mc->pg[0], mp->txnid);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
mc->tree->height--;
mc->ki[0] = mc->ki[1];
for (intptr_t i = 1; i < mc->tree->height; i++) {
mc->pg[i] = mc->pg[i + 1];
mc->ki[i] = mc->ki[i + 1];
}
/* Adjust other cursors pointing to mp */
for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2;
m2 = m2->next) {
MDBX_cursor *m3 = (mc->flags & z_inner) ? &m2->subcur->cursor : m2;
if (is_related(mc, m3) && m3->pg[0] == mp) {
for (intptr_t i = 0; i < mc->tree->height; i++) {
m3->pg[i] = m3->pg[i + 1];
m3->ki[i] = m3->ki[i + 1];
}
m3->top -= 1;
}
}
cASSERT(mc, is_leaf(mc->pg[mc->top]) ||
page_type(mc->pg[mc->top]) == pagetype);
cASSERT(mc, mc->top + 1 < mc->tree->height ||
is_leaf(mc->pg[mc->tree->height - 1]));
return page_retire(mc, mp);
}
DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)",
mp->pgno, mp->flags);
return MDBX_SUCCESS;
}
/* The parent (branch page) must have at least 2 pointers,
* otherwise the tree is invalid. */
const size_t pre_top = mc->top - 1;
cASSERT(mc, is_branch(mc->pg[pre_top]));
cASSERT(mc, !is_subpage(mc->pg[0]));
cASSERT(mc, page_numkeys(mc->pg[pre_top]) > 1);
/* Leaf page fill factor is below the threshold.
* Try to move keys from left or right neighbor, or
* merge with a neighbor page. */
/* Find neighbors. */
cursor_couple_t couple;
MDBX_cursor *const mn = cursor_clone(mc, &couple);
page_t *left = nullptr, *right = nullptr;
if (mn->ki[pre_top] > 0) {
rc =
page_get(mn, node_pgno(page_node(mn->pg[pre_top], mn->ki[pre_top] - 1)),
&left, mc->pg[mc->top]->txnid);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
cASSERT(mc, page_type(left) == page_type(mc->pg[mc->top]));
}
if (mn->ki[pre_top] + (size_t)1 < page_numkeys(mn->pg[pre_top])) {
rc = page_get(
mn, node_pgno(page_node(mn->pg[pre_top], mn->ki[pre_top] + (size_t)1)),
&right, mc->pg[mc->top]->txnid);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
cASSERT(mc, page_type(right) == page_type(mc->pg[mc->top]));
}
cASSERT(mc, left || right);
const size_t ki_top = mc->ki[mc->top];
const size_t ki_pre_top = mn->ki[pre_top];
const size_t nkeys = page_numkeys(mn->pg[mn->top]);
const size_t left_room = left ? page_room(left) : 0;
const size_t right_room = right ? page_room(right) : 0;
const size_t left_nkeys = left ? page_numkeys(left) : 0;
const size_t right_nkeys = right ? page_numkeys(right) : 0;
bool involve = false;
retry:
cASSERT(mc, mc->top > 0);
if (left_room > room_threshold && left_room >= right_room &&
(is_modifable(mc->txn, left) || involve)) {
/* try merge with left */
cASSERT(mc, left_nkeys >= minkeys);
mn->pg[mn->top] = left;
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top - 1);
mn->ki[mn->top] = (indx_t)(left_nkeys - 1);
mc->ki[mc->top] = 0;
const size_t new_ki = ki_top + left_nkeys;
mn->ki[mn->top] += mc->ki[mn->top] + 1;
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = page_merge(mc, mn);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (likely(rc != MDBX_RESULT_TRUE)) {
cursor_cpstk(mn, mc);
mc->ki[mc->top] = (indx_t)new_ki;
cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
return rc;
}
}
if (right_room > room_threshold &&
(is_modifable(mc->txn, right) || involve)) {
/* try merge with right */
cASSERT(mc, right_nkeys >= minkeys);
mn->pg[mn->top] = right;
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1);
mn->ki[mn->top] = 0;
mc->ki[mc->top] = (indx_t)nkeys;
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = page_merge(mn, mc);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (likely(rc != MDBX_RESULT_TRUE)) {
mc->ki[mc->top] = (indx_t)ki_top;
cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
return rc;
}
}
if (left_nkeys > minkeys &&
(right_nkeys <= left_nkeys || right_room >= left_room) &&
(is_modifable(mc->txn, left) || involve)) {
/* try move from left */
mn->pg[mn->top] = left;
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top - 1);
mn->ki[mn->top] = (indx_t)(left_nkeys - 1);
mc->ki[mc->top] = 0;
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = node_move(mn, mc, true);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (likely(rc != MDBX_RESULT_TRUE)) {
mc->ki[mc->top] = (indx_t)(ki_top + 1);
cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
return rc;
}
}
if (right_nkeys > minkeys && (is_modifable(mc->txn, right) || involve)) {
/* try move from right */
mn->pg[mn->top] = right;
mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1);
mn->ki[mn->top] = 0;
mc->ki[mc->top] = (indx_t)nkeys;
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = node_move(mn, mc, false);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (likely(rc != MDBX_RESULT_TRUE)) {
mc->ki[mc->top] = (indx_t)ki_top;
cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
return rc;
}
}
if (nkeys >= minkeys) {
mc->ki[mc->top] = (indx_t)ki_top;
if (AUDIT_ENABLED())
return cursor_check_updating(mc);
return MDBX_SUCCESS;
}
if (mc->txn->env->options.prefer_waf_insteadof_balance &&
likely(room_threshold > 0)) {
room_threshold = 0;
goto retry;
}
if (likely(!involve) &&
(likely(mc->tree != &mc->txn->dbs[FREE_DBI]) || mc->txn->tw.loose_pages ||
MDBX_PNL_GETSIZE(mc->txn->tw.relist) ||
(mc->flags & z_gcu_preparation) || (mc->txn->flags & txn_gc_drained) ||
room_threshold)) {
involve = true;
goto retry;
}
if (likely(room_threshold > 0)) {
room_threshold = 0;
goto retry;
}
ERROR("Unable to merge/rebalance %s page %" PRIaPGNO
" (has %zu keys, fill %u.%u%%, used %zu, room %zu bytes)",
is_leaf(tp) ? "leaf" : "branch", tp->pgno, numkeys,
page_fill_percentum_x10(mc->txn->env, tp) / 10,
page_fill_percentum_x10(mc->txn->env, tp) % 10,
page_used(mc->txn->env, tp), room);
return MDBX_PROBLEM;
}
int page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
MDBX_val *const newdata, pgno_t newpgno, const unsigned naf) {
unsigned flags;
int rc = MDBX_SUCCESS, foliage = 0;
MDBX_env *const env = mc->txn->env;
MDBX_val rkey, xdata;
page_t *tmp_ki_copy = nullptr;
DKBUF;
page_t *const mp = mc->pg[mc->top];
cASSERT(mc, (mp->flags & P_ILL_BITS) == 0);
const size_t newindx = mc->ki[mc->top];
size_t nkeys = page_numkeys(mp);
if (AUDIT_ENABLED()) {
rc = cursor_check_updating(mc);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
STATIC_ASSERT(P_BRANCH == 1);
const size_t minkeys = (mp->flags & P_BRANCH) + (size_t)1;
DEBUG(">> splitting %s-page %" PRIaPGNO
" and adding %zu+%zu [%s] at %i, nkeys %zi",
is_leaf(mp) ? "leaf" : "branch", mp->pgno, newkey->iov_len,
newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), mc->ki[mc->top],
nkeys);
cASSERT(mc, nkeys + 1 >= minkeys * 2);
/* Create a new sibling page. */
pgr_t npr = page_new(mc, mp->flags);
if (unlikely(npr.err != MDBX_SUCCESS))
return npr.err;
page_t *const sister = npr.page;
sister->dupfix_ksize = mp->dupfix_ksize;
DEBUG("new sibling: page %" PRIaPGNO, sister->pgno);
/* Usually when splitting the root page, the cursor
* height is 1. But when called from tree_propagate_key,
* the cursor height may be greater because it walks
* up the stack while finding the branch slot to update. */
intptr_t prev_top = mc->top - 1;
if (mc->top == 0) {
npr = page_new(mc, P_BRANCH);
rc = npr.err;
if (unlikely(rc != MDBX_SUCCESS))
goto done;
page_t *const pp = npr.page;
/* shift current top to make room for new parent */
cASSERT(mc, mc->tree->height > 0);
#if MDBX_DEBUG
memset(mc->pg + 3, 0, sizeof(mc->pg) - sizeof(mc->pg[0]) * 3);
memset(mc->ki + 3, -1, sizeof(mc->ki) - sizeof(mc->ki[0]) * 3);
#endif
mc->pg[2] = mc->pg[1];
mc->ki[2] = mc->ki[1];
mc->pg[1] = mc->pg[0];
mc->ki[1] = mc->ki[0];
mc->pg[0] = pp;
mc->ki[0] = 0;
mc->tree->root = pp->pgno;
DEBUG("root split! new root = %" PRIaPGNO, pp->pgno);
foliage = mc->tree->height++;
/* Add left (implicit) pointer. */
rc = node_add_branch(mc, 0, nullptr, mp->pgno);
if (unlikely(rc != MDBX_SUCCESS)) {
/* undo the pre-push */
mc->pg[0] = mc->pg[1];
mc->ki[0] = mc->ki[1];
mc->tree->root = mp->pgno;
mc->tree->height--;
goto done;
}
mc->top = 1;
prev_top = 0;
if (AUDIT_ENABLED()) {
rc = cursor_check_updating(mc);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
} else {
DEBUG("parent branch page is %" PRIaPGNO, mc->pg[prev_top]->pgno);
}
cursor_couple_t couple;
MDBX_cursor *const mn = cursor_clone(mc, &couple);
mn->pg[mn->top] = sister;
mn->ki[mn->top] = 0;
mn->ki[prev_top] = mc->ki[prev_top] + 1;
size_t split_indx =
(newindx < nkeys)
? /* split at the middle */ (nkeys + 1) >> 1
: /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1;
eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1);
cASSERT(mc, !is_branch(mp) || newindx > 0);
MDBX_val sepkey = {nullptr, 0};
/* It is reasonable and possible to split the page at the begin */
if (unlikely(newindx < minkeys)) {
split_indx = minkeys;
if (newindx == 0 && !(naf & MDBX_SPLIT_REPLACE)) {
split_indx = 0;
/* Checking for ability of splitting by the left-side insertion
* of a pure page with the new key */
for (intptr_t i = 0; i < mc->top; ++i)
if (mc->ki[i]) {
sepkey = get_key(page_node(mc->pg[i], mc->ki[i]));
if (mc->clc->k.cmp(newkey, &sepkey) >= 0)
split_indx = minkeys;
break;
}
if (split_indx == 0) {
/* Save the current first key which was omitted on the parent branch
* page and should be updated if the new first entry will be added */
if (is_dupfix_leaf(mp))
sepkey = page_dupfix_key(mp, 0, mc->tree->dupfix_size);
else
sepkey = get_key(page_node(mp, 0));
cASSERT(mc, mc->clc->k.cmp(newkey, &sepkey) < 0);
/* Avoiding rare complex cases of nested split the parent page(s) */
if (page_room(mc->pg[prev_top]) < branch_size(env, &sepkey))
split_indx = minkeys;
}
if (foliage) {
TRACE("pure-left: foliage %u, top %i, ptop %zu, split_indx %zi, "
"minkeys %zi, sepkey %s, parent-room %zu, need4split %zu",
foliage, mc->top, prev_top, split_indx, minkeys,
DKEY_DEBUG(&sepkey), page_room(mc->pg[prev_top]),
branch_size(env, &sepkey));
TRACE("pure-left: newkey %s, newdata %s, newindx %zu",
DKEY_DEBUG(newkey), DVAL_DEBUG(newdata), newindx);
}
}
}
const bool pure_right = split_indx == nkeys;
const bool pure_left = split_indx == 0;
if (unlikely(pure_right)) {
/* newindx == split_indx == nkeys */
TRACE("no-split, but add new pure page at the %s", "right/after");
cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1);
sepkey = *newkey;
} else if (unlikely(pure_left)) {
/* newindx == split_indx == 0 */
TRACE("pure-left: no-split, but add new pure page at the %s",
"left/before");
cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1);
TRACE("pure-left: old-first-key is %s", DKEY_DEBUG(&sepkey));
} else {
if (is_dupfix_leaf(sister)) {
/* Move half of the keys to the right sibling */
const intptr_t distance = mc->ki[mc->top] - split_indx;
size_t ksize = mc->tree->dupfix_size;
void *const split = page_dupfix_ptr(mp, split_indx, ksize);
size_t rsize = (nkeys - split_indx) * ksize;
size_t lsize = (nkeys - split_indx) * sizeof(indx_t);
cASSERT(mc, mp->lower >= lsize);
mp->lower -= (indx_t)lsize;
cASSERT(mc, sister->lower + lsize <= UINT16_MAX);
sister->lower += (indx_t)lsize;
cASSERT(mc, mp->upper + rsize - lsize <= UINT16_MAX);
mp->upper += (indx_t)(rsize - lsize);
cASSERT(mc, sister->upper >= rsize - lsize);
sister->upper -= (indx_t)(rsize - lsize);
sepkey.iov_len = ksize;
sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base;
if (distance < 0) {
cASSERT(mc, ksize >= sizeof(indx_t));
void *const ins = page_dupfix_ptr(mp, mc->ki[mc->top], ksize);
memcpy(sister->entries, split, rsize);
sepkey.iov_base = sister->entries;
memmove(ptr_disp(ins, ksize), ins,
(split_indx - mc->ki[mc->top]) * ksize);
memcpy(ins, newkey->iov_base, ksize);
cASSERT(mc, UINT16_MAX - mp->lower >= (int)sizeof(indx_t));
mp->lower += sizeof(indx_t);
cASSERT(mc, mp->upper >= ksize - sizeof(indx_t));
mp->upper -= (indx_t)(ksize - sizeof(indx_t));
cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0);
} else {
memcpy(sister->entries, split, distance * ksize);
void *const ins = page_dupfix_ptr(sister, distance, ksize);
memcpy(ins, newkey->iov_base, ksize);
memcpy(ptr_disp(ins, ksize), ptr_disp(split, distance * ksize),
rsize - distance * ksize);
cASSERT(mc, UINT16_MAX - sister->lower >= (int)sizeof(indx_t));
sister->lower += sizeof(indx_t);
cASSERT(mc, sister->upper >= ksize - sizeof(indx_t));
sister->upper -= (indx_t)(ksize - sizeof(indx_t));
cASSERT(mc, distance <= (int)UINT16_MAX);
mc->ki[mc->top] = (indx_t)distance;
cASSERT(mc,
(((ksize & page_numkeys(sister)) ^ sister->upper) & 1) == 0);
}
if (AUDIT_ENABLED()) {
rc = cursor_check_updating(mc);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
rc = cursor_check_updating(mn);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
} else {
/* grab a page to hold a temporary copy */
tmp_ki_copy = page_shadow_alloc(mc->txn, 1);
if (unlikely(tmp_ki_copy == nullptr)) {
rc = MDBX_ENOMEM;
goto done;
}
const size_t max_space = page_space(env);
const size_t new_size = is_leaf(mp) ? leaf_size(env, newkey, newdata)
: branch_size(env, newkey);
/* prepare to insert */
size_t i = 0;
while (i < newindx) {
tmp_ki_copy->entries[i] = mp->entries[i];
++i;
}
tmp_ki_copy->entries[i] = (indx_t)-1;
while (++i <= nkeys)
tmp_ki_copy->entries[i] = mp->entries[i - 1];
tmp_ki_copy->pgno = mp->pgno;
tmp_ki_copy->flags = mp->flags;
tmp_ki_copy->txnid = INVALID_TXNID;
tmp_ki_copy->lower = 0;
tmp_ki_copy->upper = (indx_t)max_space;
/* Добавляемый узел может не поместиться в страницу-половину вместе
* с количественной половиной узлов из исходной страницы. В худшем случае,
* в страницу-половину с добавляемым узлом могут попасть самые больше узлы
* из исходной страницы, а другую половину только узлы с самыми короткими
* ключами и с пустыми данными. Поэтому, чтобы найти подходящую границу
* разреза требуется итерировать узлы и считая их объем.
*
* Однако, при простом количественном делении (без учета размера ключей
* и данных) на страницах-половинах будет примерно вдвое меньше узлов.
* Поэтому добавляемый узел точно поместится, если его размер не больше
* чем место "освобождающееся" от заголовков узлов, которые переедут
* в другую страницу-половину. Кроме этого, как минимум по одному байту
* будет в каждом ключе, в худшем случае кроме одного, который может быть
* нулевого размера. */
if (newindx == split_indx && nkeys >= 5) {
STATIC_ASSERT(P_BRANCH == 1);
split_indx += mp->flags & P_BRANCH;
}
eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys);
const size_t dim_nodes =
(newindx >= split_indx) ? split_indx : nkeys - split_indx;
const size_t dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes;
if (new_size >= dim_used) {
/* Search for best acceptable split point */
i = (newindx < split_indx) ? 0 : nkeys;
intptr_t dir = (newindx < split_indx) ? 1 : -1;
size_t before = 0, after = new_size + page_used(env, mp);
size_t best_split = split_indx;
size_t best_shift = INT_MAX;
TRACE("seek separator from %zu, step %zi, default %zu, new-idx %zu, "
"new-size %zu",
i, dir, split_indx, newindx, new_size);
do {
cASSERT(mc, i <= nkeys);
size_t size = new_size;
if (i != newindx) {
node_t *node = ptr_disp(mp, tmp_ki_copy->entries[i] + PAGEHDRSZ);
size = NODESIZE + node_ks(node) + sizeof(indx_t);
if (is_leaf(mp))
size += (node_flags(node) & N_BIGDATA) ? sizeof(pgno_t)
: node_ds(node);
size = EVEN_CEIL(size);
}
before += size;
after -= size;
TRACE("step %zu, size %zu, before %zu, after %zu, max %zu", i, size,
before, after, max_space);
if (before <= max_space && after <= max_space) {
const size_t split = i + (dir > 0);
if (split >= minkeys && split <= nkeys + 1 - minkeys) {
const size_t shift = branchless_abs(split_indx - split);
if (shift >= best_shift)
break;
best_shift = shift;
best_split = split;
if (!best_shift)
break;
}
}
i += dir;
} while (i < nkeys);
split_indx = best_split;
TRACE("chosen %zu", split_indx);
}
eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys);
sepkey = *newkey;
if (split_indx != newindx) {
node_t *node =
ptr_disp(mp, tmp_ki_copy->entries[split_indx] + PAGEHDRSZ);
sepkey.iov_len = node_ks(node);
sepkey.iov_base = node_key(node);
}
}
}
DEBUG("separator is %zd [%s]", split_indx, DKEY_DEBUG(&sepkey));
bool did_split_parent = false;
/* Copy separator key to the parent. */
if (page_room(mn->pg[prev_top]) < branch_size(env, &sepkey)) {
TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey));
cASSERT(mc, page_numkeys(mn->pg[prev_top]) > 2);
cASSERT(mc, !pure_left);
const int top = mc->top;
const int height = mc->tree->height;
mn->top -= 1;
did_split_parent = true;
couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
rc = page_split(mn, &sepkey, nullptr, sister->pgno, 0);
mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
if (unlikely(rc != MDBX_SUCCESS))
goto done;
cASSERT(mc, mc->top - top == mc->tree->height - height);
if (AUDIT_ENABLED()) {
rc = cursor_check_updating(mc);
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
/* root split? */
prev_top += mc->top - top;
/* Right page might now have changed parent.
* Check if left page also changed parent. */
if (mn->pg[prev_top] != mc->pg[prev_top] &&
mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) {
for (intptr_t i = 0; i < prev_top; i++) {
mc->pg[i] = mn->pg[i];
mc->ki[i] = mn->ki[i];
}
mc->pg[prev_top] = mn->pg[prev_top];
if (mn->ki[prev_top]) {
mc->ki[prev_top] = mn->ki[prev_top] - 1;
} else {
/* find right page's left sibling */
mc->ki[prev_top] = mn->ki[prev_top];
rc = cursor_sibling_left(mc);
if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ {
ERROR("unexpected %i error going left sibling", rc);
rc = MDBX_PROBLEM;
}
goto done;
}
}
}
} else if (unlikely(pure_left)) {
page_t *ptop_page = mc->pg[prev_top];
TRACE("pure-left: adding to parent page %u node[%u] left-leaf page #%u key "
"%s",
ptop_page->pgno, mc->ki[prev_top], sister->pgno,
DKEY(mc->ki[prev_top] ? newkey : nullptr));
assert(mc->top == prev_top + 1);
mc->top = (uint8_t)prev_top;
rc = node_add_branch(mc, mc->ki[prev_top],
mc->ki[prev_top] ? newkey : nullptr, sister->pgno);
cASSERT(mc, mp == mc->pg[prev_top + 1] && newindx == mc->ki[prev_top + 1] &&
prev_top == mc->top);
if (likely(rc == MDBX_SUCCESS) && mc->ki[prev_top] == 0) {
node_t *node = page_node(mc->pg[prev_top], 1);
TRACE("pure-left: update prev-first key on parent to %s", DKEY(&sepkey));
cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->pgno);
cASSERT(mc, mc->top == prev_top && mc->ki[prev_top] == 0);
mc->ki[prev_top] = 1;
rc = tree_propagate_key(mc, &sepkey);
cASSERT(mc, mc->top == prev_top && mc->ki[prev_top] == 1);
cASSERT(mc,
mp == mc->pg[prev_top + 1] && newindx == mc->ki[prev_top + 1]);
mc->ki[prev_top] = 0;
} else {
TRACE("pure-left: no-need-update prev-first key on parent %s",
DKEY(&sepkey));
}
mc->top++;
if (unlikely(rc != MDBX_SUCCESS))
goto done;
node_t *node = page_node(mc->pg[prev_top], mc->ki[prev_top] + (size_t)1);
cASSERT(mc, node_pgno(node) == mp->pgno && mc->pg[prev_top] == ptop_page);
} else {
mn->top -= 1;
TRACE("add-to-parent the right-entry[%u] for new sibling-page",
mn->ki[prev_top]);
rc = node_add_branch(mn, mn->ki[prev_top], &sepkey, sister->pgno);
mn->top += 1;
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
if (unlikely(pure_left | pure_right)) {
mc->pg[mc->top] = sister;
mc->ki[mc->top] = 0;
switch (page_type(sister)) {
case P_LEAF: {
cASSERT(mc, newpgno == 0 || newpgno == P_INVALID);
rc = node_add_leaf(mc, 0, newkey, newdata, naf);
} break;
case P_LEAF | P_DUPFIX: {
cASSERT(mc, (naf & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0);
cASSERT(mc, newpgno == 0 || newpgno == P_INVALID);
rc = node_add_dupfix(mc, 0, newkey);
} break;
default:
rc = bad_page(sister, "wrong page-type %u\n", page_type(sister));
}
if (unlikely(rc != MDBX_SUCCESS))
goto done;
if (pure_right) {
for (intptr_t i = 0; i < mc->top; i++)
mc->ki[i] = mn->ki[i];
} else if (mc->ki[mc->top - 1] == 0) {
for (intptr_t i = 2; i <= mc->top; ++i)
if (mc->ki[mc->top - i]) {
sepkey = get_key(page_node(mc->pg[mc->top - i], mc->ki[mc->top - i]));
if (mc->clc->k.cmp(newkey, &sepkey) < 0) {
mc->top -= (int8_t)i;
DEBUG("pure-left: update new-first on parent [%i] page %u key %s",
mc->ki[mc->top], mc->pg[mc->top]->pgno, DKEY(newkey));
rc = tree_propagate_key(mc, newkey);
mc->top += (int8_t)i;
if (unlikely(rc != MDBX_SUCCESS))
goto done;
}
break;
}
}
} else if (tmp_ki_copy) { /* !is_dupfix_leaf(mp) */
/* Move nodes */
mc->pg[mc->top] = sister;
size_t n = 0, ii = split_indx;
do {
TRACE("i %zu, nkeys %zu => n %zu, rp #%u", ii, nkeys, n, sister->pgno);
pgno_t pgno = 0;
MDBX_val *rdata = nullptr;
if (ii == newindx) {
rkey = *newkey;
if (is_leaf(mp))
rdata = newdata;
else
pgno = newpgno;
flags = naf;
/* Update index for the new key. */
mc->ki[mc->top] = (indx_t)n;
} else {
node_t *node = ptr_disp(mp, tmp_ki_copy->entries[ii] + PAGEHDRSZ);
rkey.iov_base = node_key(node);
rkey.iov_len = node_ks(node);
if (is_leaf(mp)) {
xdata.iov_base = node_data(node);
xdata.iov_len = node_ds(node);
rdata = &xdata;
} else
pgno = node_pgno(node);
flags = node_flags(node);
}
switch (page_type(sister)) {
case P_BRANCH: {
cASSERT(mc, 0 == (uint16_t)flags);
/* First branch index doesn't need key data. */
rc = node_add_branch(mc, n, n ? &rkey : nullptr, pgno);
} break;
case P_LEAF: {
cASSERT(mc, pgno == 0);
cASSERT(mc, rdata != nullptr);
rc = node_add_leaf(mc, n, &rkey, rdata, flags);
} break;
/* case P_LEAF | P_DUPFIX: {
cASSERT(mc, (nflags & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0);
cASSERT(mc, gno == 0);
rc = mdbx_node_add_dupfix(mc, n, &rkey);
} break; */
default:
rc = bad_page(sister, "wrong page-type %u\n", page_type(sister));
}
if (unlikely(rc != MDBX_SUCCESS))
goto done;
++n;
if (++ii > nkeys) {
ii = 0;
n = 0;
mc->pg[mc->top] = tmp_ki_copy;
TRACE("switch to mp #%u", tmp_ki_copy->pgno);
}
} while (ii != split_indx);
TRACE("ii %zu, nkeys %zu, n %zu, pgno #%u", ii, nkeys, n,
mc->pg[mc->top]->pgno);
nkeys = page_numkeys(tmp_ki_copy);
for (size_t i = 0; i < nkeys; i++)
mp->entries[i] = tmp_ki_copy->entries[i];
mp->lower = tmp_ki_copy->lower;
mp->upper = tmp_ki_copy->upper;
memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1),
env->ps - tmp_ki_copy->upper - PAGEHDRSZ);
/* reset back to original page */
if (newindx < split_indx) {
mc->pg[mc->top] = mp;
} else {
mc->pg[mc->top] = sister;
mc->ki[prev_top]++;
/* Make sure ki is still valid. */
if (mn->pg[prev_top] != mc->pg[prev_top] &&
mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) {
for (intptr_t i = 0; i <= prev_top; i++) {
mc->pg[i] = mn->pg[i];
mc->ki[i] = mn->ki[i];
}
}
}
} else if (newindx >= split_indx) {
mc->pg[mc->top] = sister;
mc->ki[prev_top]++;
/* Make sure ki is still valid. */
if (mn->pg[prev_top] != mc->pg[prev_top] &&
mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) {
for (intptr_t i = 0; i <= prev_top; i++) {
mc->pg[i] = mn->pg[i];
mc->ki[i] = mn->ki[i];
}
}
}
/* Adjust other cursors pointing to mp and/or to parent page */
nkeys = page_numkeys(mp);
for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; m2 = m2->next) {
MDBX_cursor *m3 = (mc->flags & z_inner) ? &m2->subcur->cursor : m2;
if (!is_pointed(m3) || m3 == mc)
continue;
if (foliage) {
/* sub cursors may be on different DB */
if (m3->pg[0] != mp)
continue;
/* root split */
for (intptr_t k = foliage; k >= 0; k--) {
m3->ki[k + 1] = m3->ki[k];
m3->pg[k + 1] = m3->pg[k];
}
m3->ki[0] = m3->ki[0] >= nkeys + pure_left;
m3->pg[0] = mc->pg[0];
m3->top += 1;
}
if (m3->top >= mc->top && m3->pg[mc->top] == mp && !pure_left) {
if (m3->ki[mc->top] >= newindx)
m3->ki[mc->top] += !(naf & MDBX_SPLIT_REPLACE);
if (m3->ki[mc->top] >= nkeys) {
m3->pg[mc->top] = sister;
cASSERT(mc, m3->ki[mc->top] >= nkeys);
m3->ki[mc->top] -= (indx_t)nkeys;
for (intptr_t i = 0; i < mc->top; i++) {
m3->ki[i] = mn->ki[i];
m3->pg[i] = mn->pg[i];
}
}
} else if (!did_split_parent && m3->top >= prev_top &&
m3->pg[prev_top] == mc->pg[prev_top] &&
m3->ki[prev_top] >= mc->ki[prev_top]) {
m3->ki[prev_top]++; /* also for the `pure-left` case */
}
if (inner_pointed(m3) && is_leaf(mp))
cursor_inner_refresh(m3, m3->pg[mc->top], m3->ki[mc->top]);
}
TRACE("mp #%u left: %zd, sister #%u left: %zd", mp->pgno, page_room(mp),
sister->pgno, page_room(sister));
done:
if (tmp_ki_copy)
page_shadow_release(env, tmp_ki_copy, 1);
if (unlikely(rc != MDBX_SUCCESS))
mc->txn->flags |= MDBX_TXN_ERROR;
else {
if (AUDIT_ENABLED())
rc = cursor_check_updating(mc);
if (unlikely(naf & MDBX_RESERVE)) {
node_t *node = page_node(mc->pg[mc->top], mc->ki[mc->top]);
if (!(node_flags(node) & N_BIGDATA))
newdata->iov_base = node_data(node);
}
#if MDBX_ENABLE_PGOP_STAT
env->lck->pgops.split.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
}
DEBUG("<< mp #%u, rc %d", mp->pgno, rc);
return rc;
}
int tree_propagate_key(MDBX_cursor *mc, const MDBX_val *key) {
page_t *mp;
node_t *node;
size_t len;
ptrdiff_t delta, ksize, oksize;
intptr_t ptr, i, nkeys, indx;
DKBUF_DEBUG;
cASSERT(mc, cursor_is_tracked(mc));
indx = mc->ki[mc->top];
mp = mc->pg[mc->top];
node = page_node(mp, indx);
ptr = mp->entries[indx];
#if MDBX_DEBUG
MDBX_val k2;
k2.iov_base = node_key(node);
k2.iov_len = node_ks(node);
DEBUG("update key %zi (offset %zu) [%s] to [%s] on page %" PRIaPGNO, indx,
ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->pgno);
#endif /* MDBX_DEBUG */
/* Sizes must be 2-byte aligned. */
ksize = EVEN_CEIL(key->iov_len);
oksize = EVEN_CEIL(node_ks(node));
delta = ksize - oksize;
/* Shift node contents if EVEN_CEIL(key length) changed. */
if (delta) {
if (delta > (int)page_room(mp)) {
/* not enough space left, do a delete and split */
DEBUG("Not enough room, delta = %zd, splitting...", delta);
pgno_t pgno = node_pgno(node);
node_del(mc, 0);
int err = page_split(mc, key, nullptr, pgno, MDBX_SPLIT_REPLACE);
if (err == MDBX_SUCCESS && AUDIT_ENABLED())
err = cursor_check_updating(mc);
return err;
}
nkeys = page_numkeys(mp);
for (i = 0; i < nkeys; i++) {
if (mp->entries[i] <= ptr) {
cASSERT(mc, mp->entries[i] >= delta);
mp->entries[i] -= (indx_t)delta;
}
}
void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ);
len = ptr - mp->upper + NODESIZE;
memmove(ptr_disp(base, -delta), base, len);
cASSERT(mc, mp->upper >= delta);
mp->upper -= (indx_t)delta;
node = page_node(mp, indx);
}
/* But even if no shift was needed, update ksize */
node_set_ks(node, key->iov_len);
if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0))
memcpy(node_key(node), key->iov_base, key->iov_len);
return MDBX_SUCCESS;
}