/// \copyright SPDX-License-Identifier: Apache-2.0 /// \note Please refer to the COPYRIGHT file for explanations license change, /// credits and acknowledgments. /// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 #include "internals.h" static MDBX_cursor *cursor_clone(const MDBX_cursor *csrc, cursor_couple_t *couple) { cASSERT(csrc, csrc->txn->txnid >= csrc->txn->env->lck->cached_oldest.weak); couple->outer.next = nullptr; couple->outer.backup = nullptr; couple->outer.subcur = nullptr; couple->outer.clc = nullptr; couple->outer.txn = csrc->txn; couple->outer.dbi_state = csrc->dbi_state; couple->outer.checking = z_pagecheck; couple->outer.tree = nullptr; couple->outer.top_and_flags = 0; MDBX_cursor *cdst = &couple->outer; if (is_inner(csrc)) { couple->inner.cursor.next = nullptr; couple->inner.cursor.backup = nullptr; couple->inner.cursor.subcur = nullptr; couple->inner.cursor.txn = csrc->txn; couple->inner.cursor.dbi_state = csrc->dbi_state; couple->outer.subcur = &couple->inner; cdst = &couple->inner.cursor; } cdst->checking = csrc->checking; cdst->tree = csrc->tree; cdst->clc = csrc->clc; cursor_cpstk(csrc, cdst); return cdst; } /*----------------------------------------------------------------------------*/ void recalculate_merge_thresholds(MDBX_env *env) { const size_t bytes = page_space(env); env->merge_threshold = (uint16_t)(bytes - (bytes * env->options.merge_threshold_16dot16_percent >> 16)); env->merge_threshold_gc = (uint16_t)(bytes - ((env->options.merge_threshold_16dot16_percent > 19005) ? bytes / 3 /* 33 % */ : bytes / 4 /* 25 % */)); } int tree_drop(MDBX_cursor *mc, const bool may_have_tables) { MDBX_txn *txn = mc->txn; int rc = tree_search(mc, nullptr, Z_FIRST); if (likely(rc == MDBX_SUCCESS)) { /* DUPSORT sub-DBs have no large-pages/tables. Omit scanning leaves. * This also avoids any P_DUPFIX pages, which have no nodes. * Also if the DB doesn't have sub-DBs and has no large/overflow * pages, omit scanning leaves. 
*/
    if (!(may_have_tables | mc->tree->large_pages))
      cursor_pop(mc);

    /* Ask for capacity in the transaction's retired-pages list for every page
     * counted in this tree (branch + leaf + large). */
    rc = pnl_need(&txn->tw.retired_pages,
                  (size_t)mc->tree->branch_pages + (size_t)mc->tree->leaf_pages + (size_t)mc->tree->large_pages);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    /* Remember the page at each level of the current path; the `pop:` code
     * below resets the deeper cursor levels from this snapshot. */
    page_t *stack[CURSOR_STACK_SIZE];
    for (intptr_t i = 0; i <= mc->top; ++i)
      stack[i] = mc->pg[i];

    while (mc->top >= 0) {
      page_t *const mp = mc->pg[mc->top];
      const size_t nkeys = page_numkeys(mp);
      if (is_leaf(mp)) {
        cASSERT(mc, mc->top + 1 == mc->tree->height);
        for (size_t i = 0; i < nkeys; i++) {
          node_t *node = page_node(mp, i);
          if (node_flags(node) & N_BIGDATA) {
            /* The node refers to large page(s): retire them. */
            rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
            /* Leave the leaf level once no tables are expected and the
             * tree's large-page counter reads zero (presumably it is
             * decremented by page_retire_ex() — confirm). */
            if (!(may_have_tables | mc->tree->large_pages))
              goto pop;
          } else if (node_flags(node) & N_SUBDATA) {
            if (unlikely((node_flags(node) & N_DUPDATA) == 0)) {
              rc = /* disallowing implicit table deletion */ MDBX_INCOMPATIBLE;
              goto bailout;
            }
            /* Nested tree of duplicates: set up the inner cursor and drop
             * that tree recursively (it cannot hold tables itself). */
            rc = cursor_dupsort_setup(mc, node, mp);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
            rc = tree_drop(&mc->subcur->cursor, false);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
          }
        }
      } else {
        cASSERT(mc, mc->top + 1 < mc->tree->height);
        /* z_retiring is raised only while the children of this branch page
         * are being retired and is cleared again right after the loop. */
        mc->checking |= z_retiring;
        /* Page type passed to page_retire_ex() for the children: leaf-kind
         * bits taken from mc->checking when the children are on the last
         * level, P_BRANCH otherwise; P_FROZEN is added when is_frozen()
         * reports this branch page as frozen. */
        const unsigned pagetype = (is_frozen(txn, mp) ? P_FROZEN : 0) +
                                  ((mc->top + 2 == mc->tree->height) ? (mc->checking & (P_LEAF | P_DUPFIX)) : P_BRANCH);
        for (size_t i = 0; i < nkeys; i++) {
          node_t *node = page_node(mp, i);
          tASSERT(txn, (node_flags(node) & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0);
          const pgno_t pgno = node_pgno(node);
          rc = page_retire_ex(mc, pgno, nullptr, pagetype);
          if (unlikely(rc != MDBX_SUCCESS))
            goto bailout;
        }
        mc->checking -= z_retiring;
      }
      /* The root level has been processed: the walk is complete. */
      if (!mc->top)
        break;
      cASSERT(mc, nkeys > 0);
      /* Put the index past the last key of this page before asking for the
       * right sibling. */
      mc->ki[mc->top] = (indx_t)nkeys;
      rc = cursor_sibling_right(mc);
      if (unlikely(rc != MDBX_SUCCESS)) {
        if (unlikely(rc != MDBX_NOTFOUND))
          goto bailout;
      /* no more siblings, go back to beginning
       * of previous level. */
      pop:
        cursor_pop(mc);
        mc->ki[0] = 0;
        for (intptr_t i = 1; i <= mc->top; i++) {
          mc->pg[i] = stack[i];
          mc->ki[i] = 0;
        }
      }
    }
    /* Everything below the root is retired; retire the root itself. */
    rc = page_retire(mc, mc->pg[0]);
  }

bailout:
  be_poor(mc);
  if (unlikely(rc != MDBX_SUCCESS))
    txn->flags |= MDBX_TXN_ERROR;
  return rc;
}

/* Moves one node from the top page of `csrc` (index csrc->ki[csrc->top]) to
 * the top page of `cdst` (index cdst->ki[cdst->top]).
 *
 * Both cursors are expected to be at the same depth of the same tree and to
 * point at pages of the same type (asserted). A page-type mismatch detected
 * at run time is logged, sets MDBX_TXN_ERROR on the transaction and yields
 * MDBX_PROBLEM.
 *
 * `fromleft` is true when the source page is the left sibling of the
 * destination and false when it is the right sibling; it selects how other
 * tracked cursors are re-indexed after the move.
 *
 * After the node is added to the destination and deleted from the source,
 * other cursors of the same DBI pointing at the affected pages are adjusted
 * and the separator keys in the parent are updated through
 * tree_propagate_key().
 *
 * Returns MDBX_SUCCESS; MDBX_RESULT_TRUE when a branch node does not fit
 * into the destination page (checked before either page is touched); or an
 * error code from the called helpers. */
static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) {
  int rc;
  DKBUF_DEBUG;

  page_t *psrc = csrc->pg[csrc->top];
  page_t *pdst = cdst->pg[cdst->top];
  cASSERT(csrc, page_type(psrc) == page_type(pdst));
  cASSERT(csrc, csrc->tree == cdst->tree);
  cASSERT(csrc, csrc->top == cdst->top);
  if (unlikely(page_type(psrc) != page_type(pdst))) {
  /* Shared failure exit: also reached by `goto bailout` from the checks
   * further down in this function. */
  bailout:
    ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node", page_type(psrc), page_type(pdst));
    csrc->txn->flags |= MDBX_TXN_ERROR;
    return MDBX_PROBLEM;
  }

  MDBX_val key4move;
  switch (page_type(psrc)) {
  case P_BRANCH: {
    const node_t *srcnode = page_node(psrc, csrc->ki[csrc->top]);
    cASSERT(csrc, node_flags(srcnode) == 0);
    const pgno_t srcpg = node_pgno(srcnode);
    key4move.iov_len = node_ks(srcnode);
    key4move.iov_base = node_key(srcnode);

    if (csrc->ki[csrc->top] == 0) {
      /* Node 0 of a branch page is handled specially: instead of the key
       * stored in it, the key to move is taken from the first entry of the
       * lowest leaf below the source. */
      const int8_t top = csrc->top;
      cASSERT(csrc, top >= 0);
      /* must find the lowest key below src */
      rc = tree_search_lowest(csrc);
      page_t *lowest_page = csrc->pg[csrc->top];
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      cASSERT(csrc, is_leaf(lowest_page));
      if (unlikely(!is_leaf(lowest_page)))
        goto bailout;
      if (is_dupfix_leaf(lowest_page))
        key4move = page_dupfix_key(lowest_page, 0, csrc->tree->dupfix_size);
      else {
        const node_t *lowest_node = page_node(lowest_page, 0);
        key4move.iov_len = node_ks(lowest_node);
        key4move.iov_base = node_key(lowest_node);
      }

      /* restore cursor after tree_search_lowest() */
      csrc->top = top;
      csrc->ki[csrc->top] = 0;

      /* paranoia */
      cASSERT(csrc, psrc == csrc->pg[csrc->top]);
      cASSERT(csrc, is_branch(psrc));
      if (unlikely(!is_branch(psrc)))
        goto bailout;
    }

    if (cdst->ki[cdst->top] == 0) {
      /* Inserting at index 0 of the destination: work on a clone of cdst so
       * that cdst itself is not moved by the search below. */
      cursor_couple_t couple;
      MDBX_cursor *const mn = cursor_clone(cdst, &couple);
const int8_t top = cdst->top;
      cASSERT(csrc, top >= 0);

      /* must find the lowest key below dst */
      rc = tree_search_lowest(mn);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      page_t *const lowest_page = mn->pg[mn->top];
      cASSERT(cdst, is_leaf(lowest_page));
      if (unlikely(!is_leaf(lowest_page)))
        goto bailout;
      MDBX_val key;
      if (is_dupfix_leaf(lowest_page))
        key = page_dupfix_key(lowest_page, 0, mn->tree->dupfix_size);
      else {
        node_t *lowest_node = page_node(lowest_page, 0);
        key.iov_len = node_ks(lowest_node);
        key.iov_base = node_key(lowest_node);
      }

      /* restore cursor after tree_search_lowest() */
      mn->top = top;
      mn->ki[mn->top] = 0;

      /* Room check: the new branch node, plus the size change of the current
       * node 0 once `key` is written into it by tree_propagate_key() below
       * (both key sizes rounded with EVEN_CEIL). */
      const intptr_t delta = EVEN_CEIL(key.iov_len) - EVEN_CEIL(node_ks(page_node(mn->pg[mn->top], 0)));
      const intptr_t needed = branch_size(cdst->txn->env, &key4move) + delta;
      const intptr_t have = page_room(pdst);
      if (unlikely(needed > have))
        return MDBX_RESULT_TRUE;

      /* page_touch() may substitute the pages, so re-read both pointers. */
      if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
        return rc;
      psrc = csrc->pg[csrc->top];
      pdst = cdst->pg[cdst->top];

      /* Temporarily link the clone into the transaction's cursor list for
       * the duration of the tree_propagate_key() call, then unlink it. */
      couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
      mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
      rc = tree_propagate_key(mn, &key);
      mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    } else {
      const size_t needed = branch_size(cdst->txn->env, &key4move);
      const size_t have = page_room(pdst);
      if (unlikely(needed > have))
        return MDBX_RESULT_TRUE;

      if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
        return rc;
      psrc = csrc->pg[csrc->top];
      pdst = cdst->pg[cdst->top];
    }

    DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO " to node %u on page %" PRIaPGNO, "branch", csrc->ki[csrc->top],
          DKEY_DEBUG(&key4move), psrc->pgno, cdst->ki[cdst->top], pdst->pgno);
    /* Add the node to the destination page. */
    rc = node_add_branch(cdst, cdst->ki[cdst->top], &key4move, srcpg);
  } break;

  case P_LEAF: {
    /* Mark src and dst as dirty. */
    if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
      return rc;
    psrc = csrc->pg[csrc->top];
    pdst = cdst->pg[cdst->top];
    const node_t *srcnode = page_node(psrc, csrc->ki[csrc->top]);
    MDBX_val data;
    data.iov_len = node_ds(srcnode);
    data.iov_base = node_data(srcnode);
    key4move.iov_len = node_ks(srcnode);
    key4move.iov_base = node_key(srcnode);
    DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO " to node %u on page %" PRIaPGNO, "leaf", csrc->ki[csrc->top],
          DKEY_DEBUG(&key4move), psrc->pgno, cdst->ki[cdst->top], pdst->pgno);
    /* Add the node to the destination page. */
    rc = node_add_leaf(cdst, cdst->ki[cdst->top], &key4move, &data, node_flags(srcnode));
  } break;

  case P_LEAF | P_DUPFIX: {
    /* Mark src and dst as dirty. */
    if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
      return rc;
    psrc = csrc->pg[csrc->top];
    pdst = cdst->pg[cdst->top];
    key4move = page_dupfix_key(psrc, csrc->ki[csrc->top], csrc->tree->dupfix_size);
    DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO " to node %u on page %" PRIaPGNO, "leaf2", csrc->ki[csrc->top],
          DKEY_DEBUG(&key4move), psrc->pgno, cdst->ki[cdst->top], pdst->pgno);
    /* Add the node to the destination page. */
    rc = node_add_dupfix(cdst, cdst->ki[cdst->top], &key4move);
  } break;

  default:
    assert(false);
    goto bailout;
  }

  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Delete the node from the source page. */
  node_del(csrc, key4move.iov_len);

  cASSERT(csrc, psrc == csrc->pg[csrc->top]);
  cASSERT(cdst, pdst == cdst->pg[cdst->top]);
  cASSERT(csrc, page_type(psrc) == page_type(pdst));

  /* The csrc cursor here is always a temporary one, living on the stack
   * inside tree_rebalance(), so there is no need to adjust it. */
  {
    /* Adjust other cursors pointing to mp */
    MDBX_cursor *m2, *m3;
    const size_t dbi = cursor_dbi(csrc);
    cASSERT(csrc, csrc->top == cdst->top);
    if (fromleft) {
      /* Moving from the left page to the right one: ki has to be shifted
       * by +1 */
      for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) {
        /* When csrc is an inner cursor, compare against the inner cursor of
         * each tracked one. */
        m3 = (csrc->flags & z_inner) ? &m2->subcur->cursor : m2;
        if (!is_related(csrc, m3))
          continue;

        /* Cursors on the destination page at or after the insertion point
         * move one slot to the right. */
        if (m3 != cdst && m3->pg[csrc->top] == pdst && m3->ki[csrc->top] >= cdst->ki[csrc->top]) {
          m3->ki[csrc->top] += 1;
        }

        /* Cursors that were on the moved node follow it to the destination
         * page; their parent index advances to the right sibling. */
        if (/* m3 != csrc && */ m3->pg[csrc->top] == psrc && m3->ki[csrc->top] == csrc->ki[csrc->top]) {
          m3->pg[csrc->top] = pdst;
          m3->ki[csrc->top] = cdst->ki[cdst->top];
          cASSERT(csrc, csrc->top > 0);
          m3->ki[csrc->top - 1] += 1;
        }

        if (is_leaf(psrc) && inner_pointed(m3)) {
          cASSERT(csrc, csrc->top == m3->top);
          size_t nkeys = page_numkeys(m3->pg[csrc->top]);
          if (likely(nkeys > m3->ki[csrc->top]))
            cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]);
        }
      }
    } else {
      /* Moving from the right page to the left one: ki has to be shifted
       * by -1 */
      for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) {
        m3 = (csrc->flags & z_inner) ? &m2->subcur->cursor : m2;
        if (!is_related(csrc, m3))
          continue;
        if (m3->pg[csrc->top] == psrc) {
          if (!m3->ki[csrc->top]) {
            /* The cursor was on the moved (first) node: follow it to the
             * destination page, i.e. to the left sibling in the parent. */
            m3->pg[csrc->top] = pdst;
            m3->ki[csrc->top] = cdst->ki[cdst->top];
            cASSERT(csrc, csrc->top > 0 && m3->ki[csrc->top - 1] > 0);
            m3->ki[csrc->top - 1] -= 1;
          } else
            m3->ki[csrc->top] -= 1;

          if (is_leaf(psrc) && inner_pointed(m3)) {
            cASSERT(csrc, csrc->top == m3->top);
            size_t nkeys = page_numkeys(m3->pg[csrc->top]);
            if (likely(nkeys > m3->ki[csrc->top]))
              cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]);
          }
        }
      }
    }
  }

  /* Update the parent separators. */
  if (csrc->ki[csrc->top] == 0) {
    cASSERT(csrc, csrc->top > 0);
    if (csrc->ki[csrc->top - 1] != 0) {
      /* The first node of the source page changed: propagate its new first
       * key to the parent through a temporary clone positioned one level
       * up. */
      MDBX_val key;
      if (is_dupfix_leaf(psrc))
        key = page_dupfix_key(psrc, 0, csrc->tree->dupfix_size);
      else {
        node_t *srcnode = page_node(psrc, 0);
        key.iov_len = node_ks(srcnode);
        key.iov_base = node_key(srcnode);
      }
      DEBUG("update separator for source page %" PRIaPGNO " to [%s]", psrc->pgno, DKEY_DEBUG(&key));

      cursor_couple_t couple;
      MDBX_cursor *const mn = cursor_clone(csrc, &couple);
      cASSERT(csrc, mn->top > 0);
      mn->top -= 1;

      couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
      mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
      rc = tree_propagate_key(mn, &key);
      mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    }
    if (is_branch(psrc)) {
      /* Write an empty key into node 0 of the branch page; ki is forced to
       * 0 for the call and restored afterwards. */
      const MDBX_val nullkey = {0, 0};
      const indx_t ix = csrc->ki[csrc->top];
      csrc->ki[csrc->top] = 0;
      rc = tree_propagate_key(csrc, &nullkey);
      csrc->ki[csrc->top] = ix;
      cASSERT(csrc, rc == MDBX_SUCCESS);
    }
  }

  if (cdst->ki[cdst->top] == 0) {
    cASSERT(cdst, cdst->top > 0);
    if (cdst->ki[cdst->top - 1] != 0) {
      /* Same as above, for the destination page. */
      MDBX_val key;
      if (is_dupfix_leaf(pdst))
        key = page_dupfix_key(pdst, 0, cdst->tree->dupfix_size);
      else {
        node_t *srcnode = page_node(pdst, 0);
        key.iov_len = node_ks(srcnode);
        key.iov_base = node_key(srcnode);
      }
      DEBUG("update separator for destination page %" PRIaPGNO " to [%s]", pdst->pgno, DKEY_DEBUG(&key));
      cursor_couple_t couple;
      MDBX_cursor *const mn = cursor_clone(cdst, &couple);
      cASSERT(cdst, mn->top > 0);
      mn->top -= 1;

      couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
      mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
      rc = tree_propagate_key(mn, &key);
      mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    }
    if (is_branch(pdst)) {
      const MDBX_val nullkey = {0, 0};
      const indx_t ix = cdst->ki[cdst->top];
      cdst->ki[cdst->top] = 0;
      rc = tree_propagate_key(cdst, &nullkey);
      cdst->ki[cdst->top] = ix;
      cASSERT(cdst, rc == MDBX_SUCCESS);
    }
  }

  return
MDBX_SUCCESS;
}

/* Merges the top page of `csrc` into the top page of `cdst`.
 *
 * All nodes of the source page are appended to the destination page (after
 * its existing dst_nkeys entries), the source page is unlinked from its
 * parent and retired, other tracked cursors of the same DBI are re-pointed,
 * and the parent level is rebalanced by calling tree_rebalance(cdst) with
 * cdst popped one level up. Afterwards the function tries to put cdst back
 * onto the merged page.
 *
 * Both cursors must be tracked, distinct, belong to the same tree and be
 * below the root (all asserted).
 *
 * Returns MDBX_SUCCESS; MDBX_RESULT_TRUE when a branch page does not fit
 * into the destination (returned before the destination is touched);
 * MDBX_CURSOR_FULL when cdst's stack could not be restored after the
 * rebalance (cdst is then made poor); or an error code from the called
 * helpers. */
static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
  MDBX_val key;
  int rc;

  cASSERT(csrc, csrc != cdst);
  cASSERT(csrc, cursor_is_tracked(csrc));
  cASSERT(cdst, cursor_is_tracked(cdst));
  const page_t *const psrc = csrc->pg[csrc->top];
  page_t *pdst = cdst->pg[cdst->top];
  DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->pgno, pdst->pgno);

  cASSERT(csrc, page_type(psrc) == page_type(pdst));
  cASSERT(csrc, csrc->clc == cdst->clc && csrc->tree == cdst->tree);
  cASSERT(csrc, csrc->top > 0); /* can't merge root page */
  cASSERT(cdst, cdst->top > 0);
  cASSERT(cdst, cdst->top + 1 < cdst->tree->height || is_leaf(cdst->pg[cdst->tree->height - 1]));
  cASSERT(csrc, csrc->top + 1 < csrc->tree->height || is_leaf(csrc->pg[csrc->tree->height - 1]));
  cASSERT(cdst,
          csrc->txn->env->options.prefer_waf_insteadof_balance || page_room(pdst) >= page_used(cdst->txn->env, psrc));
  const int pagetype = page_type(psrc);

  /* Move all nodes from src to dst */
  const size_t dst_nkeys = page_numkeys(pdst);
  const size_t src_nkeys = page_numkeys(psrc);
  cASSERT(cdst, dst_nkeys + src_nkeys >= (is_leaf(psrc) ? 1u : 2u));
  if (likely(src_nkeys)) {
    /* `ii` is the insertion index in the destination page. */
    size_t ii = dst_nkeys;
    if (unlikely(pagetype & P_DUPFIX)) {
      /* Mark dst as dirty. */
      rc = page_touch(cdst);
      cASSERT(cdst, rc != MDBX_RESULT_TRUE);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;

      /* Fixed-size keys are stored back to back: step through them by the
       * key size. */
      key.iov_len = csrc->tree->dupfix_size;
      key.iov_base = page_data(psrc);
      size_t i = 0;
      do {
        rc = node_add_dupfix(cdst, ii++, &key);
        cASSERT(cdst, rc != MDBX_RESULT_TRUE);
        if (unlikely(rc != MDBX_SUCCESS))
          return rc;
        key.iov_base = ptr_disp(key.iov_base, key.iov_len);
      } while (++i != src_nkeys);
    } else {
      node_t *srcnode = page_node(psrc, 0);
      key.iov_len = node_ks(srcnode);
      key.iov_base = node_key(srcnode);
      if (pagetype & P_BRANCH) {
        /* For a branch page the key of node 0 is replaced by the lowest key
         * found below the source page; a clone is used so that csrc keeps
         * its position. */
        cursor_couple_t couple;
        MDBX_cursor *const mn = cursor_clone(csrc, &couple);

        /* must find the lowest key below src */
        rc = tree_search_lowest(mn);
        cASSERT(csrc, rc != MDBX_RESULT_TRUE);
        if (unlikely(rc != MDBX_SUCCESS))
          return rc;

        const page_t *mp = mn->pg[mn->top];
        if (likely(!is_dupfix_leaf(mp))) {
          cASSERT(mn, is_leaf(mp));
          const node_t *lowest = page_node(mp, 0);
          key.iov_len = node_ks(lowest);
          key.iov_base = node_key(lowest);
        } else {
          cASSERT(mn, mn->top > csrc->top);
          key = page_dupfix_key(mp, mn->ki[mn->top], csrc->tree->dupfix_size);
        }
        cASSERT(mn, key.iov_len >= csrc->clc->k.lmin);
        cASSERT(mn, key.iov_len <= csrc->clc->k.lmax);

        /* The substituted key may be longer than the one stored in node 0,
         * so re-check that everything still fits into the destination. */
        const size_t dst_room = page_room(pdst);
        const size_t src_used = page_used(cdst->txn->env, psrc);
        const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len;
        if (unlikely(space_needed > dst_room))
          return MDBX_RESULT_TRUE;
      }

      /* Mark dst as dirty. */
      rc = page_touch(cdst);
      cASSERT(cdst, rc != MDBX_RESULT_TRUE);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;

      /* Copy the nodes one by one; `key` for the first iteration was set up
       * above, for the following ones it is reloaded at the loop's end. */
      size_t i = 0;
      while (true) {
        if (pagetype & P_LEAF) {
          MDBX_val data;
          data.iov_len = node_ds(srcnode);
          data.iov_base = node_data(srcnode);
          rc = node_add_leaf(cdst, ii++, &key, &data, node_flags(srcnode));
        } else {
          cASSERT(csrc, node_flags(srcnode) == 0);
          rc = node_add_branch(cdst, ii++, &key, node_pgno(srcnode));
        }
        cASSERT(cdst, rc != MDBX_RESULT_TRUE);
        if (unlikely(rc != MDBX_SUCCESS))
          return rc;

        if (++i == src_nkeys)
          break;
        srcnode = page_node(psrc, i);
        key.iov_len = node_ks(srcnode);
        key.iov_base = node_key(srcnode);
      }
    }

    pdst = cdst->pg[cdst->top];
    DEBUG("dst page %" PRIaPGNO " now has %zu keys (%u.%u%% filled)", pdst->pgno, page_numkeys(pdst),
          page_fill_percentum_x10(cdst->txn->env, pdst) / 10, page_fill_percentum_x10(cdst->txn->env, pdst) % 10);

    cASSERT(csrc, psrc == csrc->pg[csrc->top]);
    cASSERT(cdst, pdst == cdst->pg[cdst->top]);
  }

  /* Unlink the src page from parent and add to free list. */
  csrc->top -= 1;
  node_del(csrc, 0);
  if (csrc->ki[csrc->top] == 0) {
    /* The parent's node 0 was removed: write an empty key into the node
     * that took its place. */
    const MDBX_val nullkey = {0, 0};
    rc = tree_propagate_key(csrc, &nullkey);
    cASSERT(csrc, rc != MDBX_RESULT_TRUE);
    if (unlikely(rc != MDBX_SUCCESS)) {
      csrc->top += 1;
      return rc;
    }
  }
  csrc->top += 1;

  cASSERT(csrc, psrc == csrc->pg[csrc->top]);
  cASSERT(cdst, pdst == cdst->pg[cdst->top]);

  {
    /* Adjust other cursors pointing to mp */
    MDBX_cursor *m2, *m3;
    const size_t dbi = cursor_dbi(csrc);
    for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) {
      m3 = (csrc->flags & z_inner) ? &m2->subcur->cursor : m2;
      if (!is_related(csrc, m3))
        continue;
      if (m3->pg[csrc->top] == psrc) {
        /* The cursor was on the source page: its entries now live in the
         * destination page, shifted by the number of keys that were there
         * before. */
        m3->pg[csrc->top] = pdst;
        m3->ki[csrc->top] += (indx_t)dst_nkeys;
        m3->ki[csrc->top - 1] = cdst->ki[csrc->top - 1];
      } else if (m3->pg[csrc->top - 1] == csrc->pg[csrc->top - 1] &&
                 m3->ki[csrc->top - 1] > csrc->ki[csrc->top - 1]) {
        /* The cursor is under the same parent, to the right of the removed
         * node: its parent index moves one slot to the left. */
        cASSERT(m3, m3->ki[csrc->top - 1] > 0 && m3->ki[csrc->top - 1] <= page_numkeys(m3->pg[csrc->top - 1]));
        m3->ki[csrc->top - 1] -= 1;
      }

      if (is_leaf(psrc) && inner_pointed(m3)) {
        cASSERT(csrc, csrc->top == m3->top);
        size_t nkeys = page_numkeys(m3->pg[csrc->top]);
        if (likely(nkeys > m3->ki[csrc->top]))
          cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]);
      }
    }
  }

  rc = page_retire(csrc, (page_t *)psrc);
  cASSERT(csrc, rc != MDBX_RESULT_TRUE);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  cASSERT(cdst, cdst->tree->items > 0);
  cASSERT(cdst, cdst->top + 1 <= cdst->tree->height);
  cASSERT(cdst, cdst->top > 0);
  /* Remember where cdst stood, then rebalance the parent level. The
   * rebalance may change the tree height, so the saved values are used
   * below to find the merged page in cdst's stack again. */
  page_t *const top_page = cdst->pg[cdst->top];
  const indx_t top_indx = cdst->ki[cdst->top];
  const int save_top = cdst->top;
  const uint16_t save_height = cdst->tree->height;
  cursor_pop(cdst);
  rc = tree_rebalance(cdst);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  cASSERT(cdst, cdst->tree->items > 0);
  cASSERT(cdst, cdst->top + 1 <= cdst->tree->height);

#if MDBX_ENABLE_PGOP_STAT
  cdst->txn->env->lck->pgops.merge.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */

  if (is_leaf(cdst->pg[cdst->top])) {
    /* LY: don't touch cursor if top-page is a LEAF */
    cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || page_type(cdst->pg[cdst->top]) == pagetype);
    return MDBX_SUCCESS;
  }

  cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys);

  if (unlikely(pagetype != page_type(top_page))) {
    /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */
    goto bailout;
  }

  if (top_page == cdst->pg[cdst->top]) {
    /* LY: don't touch cursor if prev top-page already on the top */
    cASSERT(cdst, cdst->ki[cdst->top] == top_indx);
    cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || page_type(cdst->pg[cdst->top]) == pagetype);
    return MDBX_SUCCESS;
  }

  /* Level at which the merged page should now be, given how much the tree
   * height changed during the rebalance. */
  const int new_top = save_top - save_height + cdst->tree->height;
  if (unlikely(new_top < 0 || new_top >= cdst->tree->height)) {
    /* LY: out of range, unable restore cursor's stack */
    goto bailout;
  }

  if (top_page == cdst->pg[new_top]) {
    cASSERT(cdst, cdst->ki[new_top] == top_indx);
    /* LY: restore cursor stack */
    cdst->top = (int8_t)new_top;
    cASSERT(cdst, cdst->top + 1 < cdst->tree->height || is_leaf(cdst->pg[cdst->tree->height - 1]));
    cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || page_type(cdst->pg[cdst->top]) == pagetype);
    return MDBX_SUCCESS;
  }

  /* NOTE(review): stub_page is the bitwise complement of the saved page
   * pointer and is accepted below as an alternative marker in the old
   * slot; where such a value gets stored is not visible here — confirm. */
  page_t *const stub_page = (page_t *)(~(uintptr_t)top_page);
  const indx_t stub_indx = top_indx;
  if (save_height > cdst->tree->height && ((cdst->pg[save_top] == top_page && cdst->ki[save_top] == top_indx) ||
                                           (cdst->pg[save_top] == stub_page && cdst->ki[save_top] == stub_indx))) {
    /* LY: restore cursor stack */
    cdst->pg[new_top] = top_page;
    cdst->ki[new_top] = top_indx;
#if MDBX_DEBUG
    cdst->pg[new_top + 1] = nullptr;
    cdst->ki[new_top + 1] = INT16_MAX;
#endif
    cdst->top = (int8_t)new_top;
    cASSERT(cdst, cdst->top + 1 < cdst->tree->height || is_leaf(cdst->pg[cdst->tree->height - 1]));
    cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || page_type(cdst->pg[cdst->top]) == pagetype);
    return MDBX_SUCCESS;
  }

bailout:
  /* LY: unable restore cursor's stack */
  be_poor(cdst);
  return MDBX_CURSOR_FULL;
}

/* Rebalances the tree around the page at the top of `mc` after a deletion.
 *
 * Nothing is done when the page holds at least `minkeys` entries (2 for a
 * branch page, 1 otherwise) and its free room does not exceed the merge
 * threshold. Otherwise:
 *  - for the root page: an empty tree is reset (root = P_INVALID, height 0)
 *    and the page retired; a branch root with a single child is collapsed
 *    into that child; a sub-page or any other root is left as is;
 *  - for a non-root page: the left and right siblings are loaded and, in
 *    this order, a merge with the left, a merge with the right, moving a
 *    node from the left and moving a node from the right are attempted,
 *    retrying with relaxed conditions when an attempt reports
 *    MDBX_RESULT_TRUE.
 *
 * `mc` must be a tracked cursor (asserted).
 *
 * Returns MDBX_SUCCESS, an error code from the called helpers, or
 * MDBX_PROBLEM when a page with fewer than `minkeys` entries could be
 * neither merged nor refilled. */
int tree_rebalance(MDBX_cursor *mc) {
  cASSERT(mc, cursor_is_tracked(mc));
  cASSERT(mc, mc->top >= 0);
  cASSERT(mc, mc->top + 1 < mc->tree->height || is_leaf(mc->pg[mc->tree->height - 1]));
  const page_t *const tp = mc->pg[mc->top];
  const uint8_t pagetype = page_type(tp);

  STATIC_ASSERT(P_BRANCH == 1);
  const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1;

  /* Pages emptier than this are candidates for merging. */
  size_t room_threshold = likely(mc->tree != &mc->txn->dbs[FREE_DBI]) ?
mc->txn->env->merge_threshold : mc->txn->env->merge_threshold_gc;

  const size_t numkeys = page_numkeys(tp);
  const size_t room = page_room(tp);
  DEBUG("rebalancing %s page %" PRIaPGNO " (has %zu keys, fill %u.%u%%, used %zu, room %zu bytes)",
        is_leaf(tp) ? "leaf" : "branch", tp->pgno, numkeys, page_fill_percentum_x10(mc->txn->env, tp) / 10,
        page_fill_percentum_x10(mc->txn->env, tp) % 10, page_used(mc->txn->env, tp), room);
  cASSERT(mc, is_modifable(mc->txn, tp));

  if (unlikely(numkeys < minkeys)) {
    DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold", tp->pgno, minkeys);
  } else if (unlikely(room > room_threshold)) {
    DEBUG("page %" PRIaPGNO " should be merged due room %zu > %zu threshold", tp->pgno, room, room_threshold);
  } else {
    DEBUG("no need to rebalance page %" PRIaPGNO ", room %zu < %zu threshold", tp->pgno, room, room_threshold);
    cASSERT(mc, mc->tree->items > 0);
    return MDBX_SUCCESS;
  }

  int rc;
  if (mc->top == 0) {
    /* The page is the root: there are no siblings to merge with. */
    page_t *const mp = mc->pg[0];
    const size_t nkeys = page_numkeys(mp);
    cASSERT(mc, (mc->tree->items == 0) == (nkeys == 0));
    if (nkeys == 0) {
      DEBUG("%s", "tree is completely empty");
      cASSERT(mc, is_leaf(mp));
      cASSERT(mc, (*cursor_dbi_state(mc) & DBI_DIRTY) != 0);
      cASSERT(mc, mc->tree->branch_pages == 0 && mc->tree->large_pages == 0 && mc->tree->leaf_pages == 1);
      /* Adjust cursors pointing to mp */
      for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; m2 = m2->next) {
        MDBX_cursor *m3 = (mc->flags & z_inner) ? &m2->subcur->cursor : m2;
        if (!is_poor(m3) && m3->pg[0] == mp) {
          be_poor(m3);
          m3->flags |= z_after_delete;
        }
      }
      if (is_subpage(mp)) {
        /* A sub-page is not retired and the tree record is left as is. */
        return MDBX_SUCCESS;
      } else {
        mc->tree->root = P_INVALID;
        mc->tree->height = 0;
        return page_retire(mc, mp);
      }
    }
    if (is_subpage(mp)) {
      DEBUG("%s", "Can't rebalance a subpage, ignoring");
      cASSERT(mc, is_leaf(tp));
      return MDBX_SUCCESS;
    }
    if (is_branch(mp) && nkeys == 1) {
      DEBUG("%s", "collapsing root page!");
      /* The only child becomes the new root; every level of the cursor
       * stack shifts up by one. */
      mc->tree->root = node_pgno(page_node(mp, 0));
      rc = page_get(mc, mc->tree->root, &mc->pg[0], mp->txnid);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      mc->tree->height--;
      mc->ki[0] = mc->ki[1];
      for (intptr_t i = 1; i < mc->tree->height; i++) {
        mc->pg[i] = mc->pg[i + 1];
        mc->ki[i] = mc->ki[i + 1];
      }

      /* Adjust other cursors pointing to mp */
      for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; m2 = m2->next) {
        MDBX_cursor *m3 = (mc->flags & z_inner) ? &m2->subcur->cursor : m2;
        if (is_related(mc, m3) && m3->pg[0] == mp) {
          for (intptr_t i = 0; i < mc->tree->height; i++) {
            m3->pg[i] = m3->pg[i + 1];
            m3->ki[i] = m3->ki[i + 1];
          }
          m3->top -= 1;
        }
      }
      cASSERT(mc, is_leaf(mc->pg[mc->top]) || page_type(mc->pg[mc->top]) == pagetype);
      cASSERT(mc, mc->top + 1 < mc->tree->height || is_leaf(mc->pg[mc->tree->height - 1]));
      return page_retire(mc, mp);
    }
    DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", mp->pgno, mp->flags);
    return MDBX_SUCCESS;
  }

  /* The parent (branch page) must have at least 2 pointers,
   * otherwise the tree is invalid. */
  const size_t pre_top = mc->top - 1;
  cASSERT(mc, is_branch(mc->pg[pre_top]));
  cASSERT(mc, !is_subpage(mc->pg[0]));
  cASSERT(mc, page_numkeys(mc->pg[pre_top]) > 1);

  /* Leaf page fill factor is below the threshold.
   * Try to move keys from left or right neighbor, or
   * merge with a neighbor page. */

  /* Find neighbors. */
  cursor_couple_t couple;
  MDBX_cursor *const mn = cursor_clone(mc, &couple);

  page_t *left = nullptr, *right = nullptr;
  if (mn->ki[pre_top] > 0) {
    rc = page_get(mn, node_pgno(page_node(mn->pg[pre_top], mn->ki[pre_top] - 1)), &left, mc->pg[mc->top]->txnid);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    cASSERT(mc, page_type(left) == page_type(mc->pg[mc->top]));
  }
  if (mn->ki[pre_top] + (size_t)1 < page_numkeys(mn->pg[pre_top])) {
    rc = page_get(mn, node_pgno(page_node(mn->pg[pre_top], mn->ki[pre_top] + (size_t)1)), &right,
                  mc->pg[mc->top]->txnid);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    cASSERT(mc, page_type(right) == page_type(mc->pg[mc->top]));
  }
  cASSERT(mc, left || right);

  /* Snapshot of the positions and sibling statistics; these values stay
   * fixed across the retries below. */
  const size_t ki_top = mc->ki[mc->top];
  const size_t ki_pre_top = mn->ki[pre_top];
  const size_t nkeys = page_numkeys(mn->pg[mn->top]);

  const size_t left_room = left ? page_room(left) : 0;
  const size_t right_room = right ? page_room(right) : 0;
  const size_t left_nkeys = left ? page_numkeys(left) : 0;
  const size_t right_nkeys = right ? page_numkeys(right) : 0;
  /* `involve` allows a sibling to take part even when is_modifable()
   * rejects it; it starts out true when only one sibling exists. */
  bool involve = !(left && right);
retry:
  cASSERT(mc, mc->top > 0);
  if (left_room > room_threshold && left_room >= right_room && (is_modifable(mc->txn, left) || involve)) {
    /* try merge with left */
    cASSERT(mc, left_nkeys >= minkeys);
    mn->pg[mn->top] = left;
    mn->ki[mn->top - 1] = (indx_t)(ki_pre_top - 1);
    mn->ki[mn->top] = (indx_t)(left_nkeys - 1);
    mc->ki[mc->top] = 0;
    /* Position of the current entry once this page's nodes are appended
     * after the left page's ones. */
    const size_t new_ki = ki_top + left_nkeys;
    mn->ki[mn->top] += mc->ki[mn->top] + 1;
    /* The clone is linked into the cursor list only for the duration of
     * the call, then unlinked. */
    couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
    mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
    rc = page_merge(mc, mn);
    mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
    if (likely(rc != MDBX_RESULT_TRUE)) {
      cursor_cpstk(mn, mc);
      mc->ki[mc->top] = (indx_t)new_ki;
      cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
      return rc;
    }
  }
  if (right_room > room_threshold && (is_modifable(mc->txn, right) || involve)) {
    /* try merge with right */
    cASSERT(mc, right_nkeys >= minkeys);
    mn->pg[mn->top] = right;
    mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1);
    mn->ki[mn->top] = 0;
    mc->ki[mc->top] = (indx_t)nkeys;
    couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
    mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
    rc = page_merge(mn, mc);
    mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
    if (likely(rc != MDBX_RESULT_TRUE)) {
      mc->ki[mc->top] = (indx_t)ki_top;
      cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
      return rc;
    }
  }

  if (left_nkeys > minkeys && (right_nkeys <= left_nkeys || right_room >= left_room) &&
      (is_modifable(mc->txn, left) || involve)) {
    /* try move from left */
    mn->pg[mn->top] = left;
    mn->ki[mn->top - 1] = (indx_t)(ki_pre_top - 1);
    mn->ki[mn->top] = (indx_t)(left_nkeys - 1);
    mc->ki[mc->top] = 0;
    couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
    mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
    rc = node_move(mn, mc, true);
    mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
    if (likely(rc != MDBX_RESULT_TRUE)) {
      /* A node was put in front of the page: the current entry is one
       * slot further. */
      mc->ki[mc->top] = (indx_t)(ki_top + 1);
      cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
      return rc;
    }
  }
  if (right_nkeys > minkeys && (is_modifable(mc->txn, right) || involve)) {
    /* try move from right */
    mn->pg[mn->top] = right;
    mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1);
    mn->ki[mn->top] = 0;
    mc->ki[mc->top] = (indx_t)nkeys;
    couple.outer.next = mn->txn->cursors[cursor_dbi(mn)];
    mn->txn->cursors[cursor_dbi(mn)] = &couple.outer;
    rc = node_move(mn, mc, false);
    mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next;
    if (likely(rc != MDBX_RESULT_TRUE)) {
      mc->ki[mc->top] = (indx_t)ki_top;
      cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys);
      return rc;
    }
  }

  /* None of the four attempts was taken or succeeded. That is acceptable
   * while the page still has the minimum number of keys. */
  if (nkeys >= minkeys) {
    mc->ki[mc->top] = (indx_t)ki_top;
    if (AUDIT_ENABLED())
      return cursor_check_updating(mc);
    return MDBX_SUCCESS;
  }

  /* The page is below the minimum, so it has to be fixed: retry with
   * progressively relaxed conditions. The order in which the threshold is
   * dropped and non-modifiable siblings are admitted depends on the
   * prefer_waf_insteadof_balance option. */
  if (mc->txn->env->options.prefer_waf_insteadof_balance && likely(room_threshold > 0)) {
    room_threshold = 0;
    goto retry;
  }
  if (likely(!involve) &&
      (likely(mc->tree != &mc->txn->dbs[FREE_DBI]) || mc->txn->tw.loose_pages ||
       MDBX_PNL_GETSIZE(mc->txn->tw.relist) || (mc->flags & z_gcu_preparation) || (mc->txn->flags & txn_gc_drained) ||
       room_threshold)) {
    involve = true;
    goto retry;
  }
  if (likely(room_threshold > 0)) {
    room_threshold = 0;
    goto retry;
  }

  ERROR("Unable to merge/rebalance %s page %" PRIaPGNO " (has %zu keys, fill %u.%u%%, used %zu, room %zu bytes)",
        is_leaf(tp) ? "leaf" : "branch", tp->pgno, numkeys, page_fill_percentum_x10(mc->txn->env, tp) / 10,
        page_fill_percentum_x10(mc->txn->env, tp) % 10, page_used(mc->txn->env, tp), room);
  return MDBX_PROBLEM;
}

int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, MDBX_val *const newdata, pgno_t newpgno,
               const unsigned naf) {
  unsigned flags;
  int rc = MDBX_SUCCESS, foliage = 0;
  MDBX_env *const env = mc->txn->env;
  MDBX_val rkey, xdata;
  page_t *tmp_ki_copy = nullptr;
  DKBUF;

  page_t *const mp = mc->pg[mc->top];
  cASSERT(mc, (mp->flags & P_ILL_BITS) == 0);

  const size_t newindx = mc->ki[mc->top];
  size_t nkeys = page_numkeys(mp);
  if (AUDIT_ENABLED()) {
    rc = cursor_check_updating(mc);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  STATIC_ASSERT(P_BRANCH == 1);
  const size_t minkeys = (mp->flags & P_BRANCH) + (size_t)1;

  DEBUG(">> splitting %s-page %" PRIaPGNO " and adding %zu+%zu [%s] at %i, nkeys %zi", is_leaf(mp) ? "leaf" : "branch",
        mp->pgno, newkey->iov_len, newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), mc->ki[mc->top], nkeys);
  cASSERT(mc, nkeys + 1 >= minkeys * 2);

  /* Create a new sibling page. */
  pgr_t npr = page_new(mc, mp->flags);
  if (unlikely(npr.err != MDBX_SUCCESS))
    return npr.err;
  page_t *const sister = npr.page;
  sister->dupfix_ksize = mp->dupfix_ksize;
  DEBUG("new sibling: page %" PRIaPGNO, sister->pgno);

  /* Usually when splitting the root page, the cursor
   * height is 1. But when called from tree_propagate_key,
   * the cursor height may be greater because it walks
   * up the stack while finding the branch slot to update.
*/ intptr_t prev_top = mc->top - 1; if (mc->top == 0) { npr = page_new(mc, P_BRANCH); rc = npr.err; if (unlikely(rc != MDBX_SUCCESS)) goto done; page_t *const pp = npr.page; /* shift current top to make room for new parent */ cASSERT(mc, mc->tree->height > 0); #if MDBX_DEBUG memset(mc->pg + 3, 0, sizeof(mc->pg) - sizeof(mc->pg[0]) * 3); memset(mc->ki + 3, -1, sizeof(mc->ki) - sizeof(mc->ki[0]) * 3); #endif mc->pg[2] = mc->pg[1]; mc->ki[2] = mc->ki[1]; mc->pg[1] = mc->pg[0]; mc->ki[1] = mc->ki[0]; mc->pg[0] = pp; mc->ki[0] = 0; mc->tree->root = pp->pgno; DEBUG("root split! new root = %" PRIaPGNO, pp->pgno); foliage = mc->tree->height++; /* Add left (implicit) pointer. */ rc = node_add_branch(mc, 0, nullptr, mp->pgno); if (unlikely(rc != MDBX_SUCCESS)) { /* undo the pre-push */ mc->pg[0] = mc->pg[1]; mc->ki[0] = mc->ki[1]; mc->tree->root = mp->pgno; mc->tree->height--; goto done; } mc->top = 1; prev_top = 0; if (AUDIT_ENABLED()) { rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { DEBUG("parent branch page is %" PRIaPGNO, mc->pg[prev_top]->pgno); } cursor_couple_t couple; MDBX_cursor *const mn = cursor_clone(mc, &couple); mn->pg[mn->top] = sister; mn->ki[mn->top] = 0; mn->ki[prev_top] = mc->ki[prev_top] + 1; size_t split_indx = (newindx < nkeys) ? /* split at the middle */ (nkeys + 1) >> 1 : /* split at the end (i.e. 
like append-mode ) */ nkeys - minkeys + 1; eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); cASSERT(mc, !is_branch(mp) || newindx > 0); MDBX_val sepkey = {nullptr, 0}; /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; if (newindx == 0 && !(naf & MDBX_SPLIT_REPLACE)) { split_indx = 0; /* Checking for ability of splitting by the left-side insertion * of a pure page with the new key */ for (intptr_t i = 0; i < mc->top; ++i) if (mc->ki[i]) { sepkey = get_key(page_node(mc->pg[i], mc->ki[i])); if (mc->clc->k.cmp(newkey, &sepkey) >= 0) split_indx = minkeys; break; } if (split_indx == 0) { /* Save the current first key which was omitted on the parent branch * page and should be updated if the new first entry will be added */ if (is_dupfix_leaf(mp)) sepkey = page_dupfix_key(mp, 0, mc->tree->dupfix_size); else sepkey = get_key(page_node(mp, 0)); cASSERT(mc, mc->clc->k.cmp(newkey, &sepkey) < 0); /* Avoiding rare complex cases of nested split the parent page(s) */ if (page_room(mc->pg[prev_top]) < branch_size(env, &sepkey)) split_indx = minkeys; } if (foliage) { TRACE("pure-left: foliage %u, top %i, ptop %zu, split_indx %zi, " "minkeys %zi, sepkey %s, parent-room %zu, need4split %zu", foliage, mc->top, prev_top, split_indx, minkeys, DKEY_DEBUG(&sepkey), page_room(mc->pg[prev_top]), branch_size(env, &sepkey)); TRACE("pure-left: newkey %s, newdata %s, newindx %zu", DKEY_DEBUG(newkey), DVAL_DEBUG(newdata), newindx); } } } const bool pure_right = split_indx == nkeys; const bool pure_left = split_indx == 0; if (unlikely(pure_right)) { /* newindx == split_indx == nkeys */ TRACE("no-split, but add new pure page at the %s", "right/after"); cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); sepkey = *newkey; } else if (unlikely(pure_left)) { /* newindx == split_indx == 0 */ TRACE("pure-left: no-split, but add new pure page at the %s", "left/before"); cASSERT(mc, newindx 
== 0 && split_indx == 0 && minkeys == 1); TRACE("pure-left: old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (is_dupfix_leaf(sister)) { /* Move half of the keys to the right sibling */ const intptr_t distance = mc->ki[mc->top] - split_indx; size_t ksize = mc->tree->dupfix_size; void *const split = page_dupfix_ptr(mp, split_indx, ksize); size_t rsize = (nkeys - split_indx) * ksize; size_t lsize = (nkeys - split_indx) * sizeof(indx_t); cASSERT(mc, mp->lower >= lsize); mp->lower -= (indx_t)lsize; cASSERT(mc, sister->lower + lsize <= UINT16_MAX); sister->lower += (indx_t)lsize; cASSERT(mc, mp->upper + rsize - lsize <= UINT16_MAX); mp->upper += (indx_t)(rsize - lsize); cASSERT(mc, sister->upper >= rsize - lsize); sister->upper -= (indx_t)(rsize - lsize); sepkey.iov_len = ksize; sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base; if (distance < 0) { cASSERT(mc, ksize >= sizeof(indx_t)); void *const ins = page_dupfix_ptr(mp, mc->ki[mc->top], ksize); memcpy(sister->entries, split, rsize); sepkey.iov_base = sister->entries; memmove(ptr_disp(ins, ksize), ins, (split_indx - mc->ki[mc->top]) * ksize); memcpy(ins, newkey->iov_base, ksize); cASSERT(mc, UINT16_MAX - mp->lower >= (int)sizeof(indx_t)); mp->lower += sizeof(indx_t); cASSERT(mc, mp->upper >= ksize - sizeof(indx_t)); mp->upper -= (indx_t)(ksize - sizeof(indx_t)); cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0); } else { memcpy(sister->entries, split, distance * ksize); void *const ins = page_dupfix_ptr(sister, distance, ksize); memcpy(ins, newkey->iov_base, ksize); memcpy(ptr_disp(ins, ksize), ptr_disp(split, distance * ksize), rsize - distance * ksize); cASSERT(mc, UINT16_MAX - sister->lower >= (int)sizeof(indx_t)); sister->lower += sizeof(indx_t); cASSERT(mc, sister->upper >= ksize - sizeof(indx_t)); sister->upper -= (indx_t)(ksize - sizeof(indx_t)); cASSERT(mc, distance <= (int)UINT16_MAX); mc->ki[mc->top] = (indx_t)distance; cASSERT(mc, (((ksize & page_numkeys(sister)) ^ 
sister->upper) & 1) == 0); } if (AUDIT_ENABLED()) { rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; rc = cursor_check_updating(mn); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { /* grab a page to hold a temporary copy */ tmp_ki_copy = page_shadow_alloc(mc->txn, 1); if (unlikely(tmp_ki_copy == nullptr)) { rc = MDBX_ENOMEM; goto done; } const size_t max_space = page_space(env); const size_t new_size = is_leaf(mp) ? leaf_size(env, newkey, newdata) : branch_size(env, newkey); /* prepare to insert */ size_t i = 0; while (i < newindx) { tmp_ki_copy->entries[i] = mp->entries[i]; ++i; } tmp_ki_copy->entries[i] = (indx_t)-1; while (++i <= nkeys) tmp_ki_copy->entries[i] = mp->entries[i - 1]; tmp_ki_copy->pgno = mp->pgno; tmp_ki_copy->flags = mp->flags; tmp_ki_copy->txnid = INVALID_TXNID; tmp_ki_copy->lower = 0; tmp_ki_copy->upper = (indx_t)max_space; /* Добавляемый узел может не поместиться в страницу-половину вместе * с количественной половиной узлов из исходной страницы. В худшем случае, * в страницу-половину с добавляемым узлом могут попасть самые больше узлы * из исходной страницы, а другую половину только узлы с самыми короткими * ключами и с пустыми данными. Поэтому, чтобы найти подходящую границу * разреза требуется итерировать узлы и считая их объем. * * Однако, при простом количественном делении (без учета размера ключей * и данных) на страницах-половинах будет примерно вдвое меньше узлов. * Поэтому добавляемый узел точно поместится, если его размер не больше * чем место "освобождающееся" от заголовков узлов, которые переедут * в другую страницу-половину. Кроме этого, как минимум по одному байту * будет в каждом ключе, в худшем случае кроме одного, который может быть * нулевого размера. */ if (newindx == split_indx && nkeys >= 5) { STATIC_ASSERT(P_BRANCH == 1); split_indx += mp->flags & P_BRANCH; } eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); const size_t dim_nodes = (newindx >= split_indx) ? 
split_indx : nkeys - split_indx; const size_t dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; if (new_size >= dim_used) { /* Search for best acceptable split point */ i = (newindx < split_indx) ? 0 : nkeys; intptr_t dir = (newindx < split_indx) ? 1 : -1; size_t before = 0, after = new_size + page_used(env, mp); size_t best_split = split_indx; size_t best_shift = INT_MAX; TRACE("seek separator from %zu, step %zi, default %zu, new-idx %zu, " "new-size %zu", i, dir, split_indx, newindx, new_size); do { cASSERT(mc, i <= nkeys); size_t size = new_size; if (i != newindx) { node_t *node = ptr_disp(mp, tmp_ki_copy->entries[i] + PAGEHDRSZ); size = NODESIZE + node_ks(node) + sizeof(indx_t); if (is_leaf(mp)) size += (node_flags(node) & N_BIGDATA) ? sizeof(pgno_t) : node_ds(node); size = EVEN_CEIL(size); } before += size; after -= size; TRACE("step %zu, size %zu, before %zu, after %zu, max %zu", i, size, before, after, max_space); if (before <= max_space && after <= max_space) { const size_t split = i + (dir > 0); if (split >= minkeys && split <= nkeys + 1 - minkeys) { const size_t shift = branchless_abs(split_indx - split); if (shift >= best_shift) break; best_shift = shift; best_split = split; if (!best_shift) break; } } i += dir; } while (i < nkeys); split_indx = best_split; TRACE("chosen %zu", split_indx); } eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); sepkey = *newkey; if (split_indx != newindx) { node_t *node = ptr_disp(mp, tmp_ki_copy->entries[split_indx] + PAGEHDRSZ); sepkey.iov_len = node_ks(node); sepkey.iov_base = node_key(node); } } } DEBUG("separator is %zd [%s]", split_indx, DKEY_DEBUG(&sepkey)); bool did_split_parent = false; /* Copy separator key to the parent. 
*/ if (page_room(mn->pg[prev_top]) < branch_size(env, &sepkey)) { TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); cASSERT(mc, page_numkeys(mn->pg[prev_top]) > 2); cASSERT(mc, !pure_left); const int top = mc->top; const int height = mc->tree->height; mn->top -= 1; did_split_parent = true; couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; rc = page_split(mn, &sepkey, nullptr, sister->pgno, 0); mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; if (unlikely(rc != MDBX_SUCCESS)) goto done; cASSERT(mc, mc->top - top == mc->tree->height - height); if (AUDIT_ENABLED()) { rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } /* root split? */ prev_top += mc->top - top; /* Right page might now have changed parent. * Check if left page also changed parent. */ if (mn->pg[prev_top] != mc->pg[prev_top] && mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) { for (intptr_t i = 0; i < prev_top; i++) { mc->pg[i] = mn->pg[i]; mc->ki[i] = mn->ki[i]; } mc->pg[prev_top] = mn->pg[prev_top]; if (mn->ki[prev_top]) { mc->ki[prev_top] = mn->ki[prev_top] - 1; } else { /* find right page's left sibling */ mc->ki[prev_top] = mn->ki[prev_top]; rc = cursor_sibling_left(mc); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { ERROR("unexpected %i error going left sibling", rc); rc = MDBX_PROBLEM; } goto done; } } } } else if (unlikely(pure_left)) { page_t *ptop_page = mc->pg[prev_top]; TRACE("pure-left: adding to parent page %u node[%u] left-leaf page #%u key " "%s", ptop_page->pgno, mc->ki[prev_top], sister->pgno, DKEY(mc->ki[prev_top] ? newkey : nullptr)); assert(mc->top == prev_top + 1); mc->top = (uint8_t)prev_top; rc = node_add_branch(mc, mc->ki[prev_top], mc->ki[prev_top] ? 
newkey : nullptr, sister->pgno); cASSERT(mc, mp == mc->pg[prev_top + 1] && newindx == mc->ki[prev_top + 1] && prev_top == mc->top); if (likely(rc == MDBX_SUCCESS) && mc->ki[prev_top] == 0) { node_t *node = page_node(mc->pg[prev_top], 1); TRACE("pure-left: update prev-first key on parent to %s", DKEY(&sepkey)); cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->pgno); cASSERT(mc, mc->top == prev_top && mc->ki[prev_top] == 0); mc->ki[prev_top] = 1; rc = tree_propagate_key(mc, &sepkey); cASSERT(mc, mc->top == prev_top && mc->ki[prev_top] == 1); cASSERT(mc, mp == mc->pg[prev_top + 1] && newindx == mc->ki[prev_top + 1]); mc->ki[prev_top] = 0; } else { TRACE("pure-left: no-need-update prev-first key on parent %s", DKEY(&sepkey)); } mc->top++; if (unlikely(rc != MDBX_SUCCESS)) goto done; node_t *node = page_node(mc->pg[prev_top], mc->ki[prev_top] + (size_t)1); cASSERT(mc, node_pgno(node) == mp->pgno && mc->pg[prev_top] == ptop_page); } else { mn->top -= 1; TRACE("add-to-parent the right-entry[%u] for new sibling-page", mn->ki[prev_top]); rc = node_add_branch(mn, mn->ki[prev_top], &sepkey, sister->pgno); mn->top += 1; if (unlikely(rc != MDBX_SUCCESS)) goto done; } if (unlikely(pure_left | pure_right)) { mc->pg[mc->top] = sister; mc->ki[mc->top] = 0; switch (page_type(sister)) { case P_LEAF: { cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); rc = node_add_leaf(mc, 0, newkey, newdata, naf); } break; case P_LEAF | P_DUPFIX: { cASSERT(mc, (naf & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0); cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); rc = node_add_dupfix(mc, 0, newkey); } break; default: rc = bad_page(sister, "wrong page-type %u\n", page_type(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; if (pure_right) { for (intptr_t i = 0; i < mc->top; i++) mc->ki[i] = mn->ki[i]; } else if (mc->ki[mc->top - 1] == 0) { for (intptr_t i = 2; i <= mc->top; ++i) if (mc->ki[mc->top - i]) { sepkey = get_key(page_node(mc->pg[mc->top - i], mc->ki[mc->top - i])); if 
(mc->clc->k.cmp(newkey, &sepkey) < 0) { mc->top -= (int8_t)i; DEBUG("pure-left: update new-first on parent [%i] page %u key %s", mc->ki[mc->top], mc->pg[mc->top]->pgno, DKEY(newkey)); rc = tree_propagate_key(mc, newkey); mc->top += (int8_t)i; if (unlikely(rc != MDBX_SUCCESS)) goto done; } break; } } } else if (tmp_ki_copy) { /* !is_dupfix_leaf(mp) */ /* Move nodes */ mc->pg[mc->top] = sister; size_t n = 0, ii = split_indx; do { TRACE("i %zu, nkeys %zu => n %zu, rp #%u", ii, nkeys, n, sister->pgno); pgno_t pgno = 0; MDBX_val *rdata = nullptr; if (ii == newindx) { rkey = *newkey; if (is_leaf(mp)) rdata = newdata; else pgno = newpgno; flags = naf; /* Update index for the new key. */ mc->ki[mc->top] = (indx_t)n; } else { node_t *node = ptr_disp(mp, tmp_ki_copy->entries[ii] + PAGEHDRSZ); rkey.iov_base = node_key(node); rkey.iov_len = node_ks(node); if (is_leaf(mp)) { xdata.iov_base = node_data(node); xdata.iov_len = node_ds(node); rdata = &xdata; } else pgno = node_pgno(node); flags = node_flags(node); } switch (page_type(sister)) { case P_BRANCH: { cASSERT(mc, 0 == (uint16_t)flags); /* First branch index doesn't need key data. */ rc = node_add_branch(mc, n, n ? 
&rkey : nullptr, pgno); } break; case P_LEAF: { cASSERT(mc, pgno == 0); cASSERT(mc, rdata != nullptr); rc = node_add_leaf(mc, n, &rkey, rdata, flags); } break; /* case P_LEAF | P_DUPFIX: { cASSERT(mc, (nflags & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0); cASSERT(mc, gno == 0); rc = mdbx_node_add_dupfix(mc, n, &rkey); } break; */ default: rc = bad_page(sister, "wrong page-type %u\n", page_type(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; ++n; if (++ii > nkeys) { ii = 0; n = 0; mc->pg[mc->top] = tmp_ki_copy; TRACE("switch to mp #%u", tmp_ki_copy->pgno); } } while (ii != split_indx); TRACE("ii %zu, nkeys %zu, n %zu, pgno #%u", ii, nkeys, n, mc->pg[mc->top]->pgno); nkeys = page_numkeys(tmp_ki_copy); for (size_t i = 0; i < nkeys; i++) mp->entries[i] = tmp_ki_copy->entries[i]; mp->lower = tmp_ki_copy->lower; mp->upper = tmp_ki_copy->upper; memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1), env->ps - tmp_ki_copy->upper - PAGEHDRSZ); /* reset back to original page */ if (newindx < split_indx) { mc->pg[mc->top] = mp; } else { mc->pg[mc->top] = sister; mc->ki[prev_top]++; /* Make sure ki is still valid. */ if (mn->pg[prev_top] != mc->pg[prev_top] && mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) { for (intptr_t i = 0; i <= prev_top; i++) { mc->pg[i] = mn->pg[i]; mc->ki[i] = mn->ki[i]; } } } } else if (newindx >= split_indx) { mc->pg[mc->top] = sister; mc->ki[prev_top]++; /* Make sure ki is still valid. */ if (mn->pg[prev_top] != mc->pg[prev_top] && mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) { for (intptr_t i = 0; i <= prev_top; i++) { mc->pg[i] = mn->pg[i]; mc->ki[i] = mn->ki[i]; } } } /* Adjust other cursors pointing to mp and/or to parent page */ nkeys = page_numkeys(mp); for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; m2 = m2->next) { MDBX_cursor *m3 = (mc->flags & z_inner) ? 
&m2->subcur->cursor : m2; if (!is_pointed(m3) || m3 == mc) continue; if (foliage) { /* sub cursors may be on different DB */ if (m3->pg[0] != mp) continue; /* root split */ for (intptr_t k = foliage; k >= 0; k--) { m3->ki[k + 1] = m3->ki[k]; m3->pg[k + 1] = m3->pg[k]; } m3->ki[0] = m3->ki[0] >= nkeys + pure_left; m3->pg[0] = mc->pg[0]; m3->top += 1; } if (m3->top >= mc->top && m3->pg[mc->top] == mp && !pure_left) { if (m3->ki[mc->top] >= newindx) m3->ki[mc->top] += !(naf & MDBX_SPLIT_REPLACE); if (m3->ki[mc->top] >= nkeys) { m3->pg[mc->top] = sister; cASSERT(mc, m3->ki[mc->top] >= nkeys); m3->ki[mc->top] -= (indx_t)nkeys; for (intptr_t i = 0; i < mc->top; i++) { m3->ki[i] = mn->ki[i]; m3->pg[i] = mn->pg[i]; } } } else if (!did_split_parent && m3->top >= prev_top && m3->pg[prev_top] == mc->pg[prev_top] && m3->ki[prev_top] >= mc->ki[prev_top]) { m3->ki[prev_top]++; /* also for the `pure-left` case */ } if (inner_pointed(m3) && is_leaf(mp)) cursor_inner_refresh(m3, m3->pg[mc->top], m3->ki[mc->top]); } TRACE("mp #%u left: %zd, sister #%u left: %zd", mp->pgno, page_room(mp), sister->pgno, page_room(sister)); done: if (tmp_ki_copy) page_shadow_release(env, tmp_ki_copy, 1); if (unlikely(rc != MDBX_SUCCESS)) mc->txn->flags |= MDBX_TXN_ERROR; else { if (AUDIT_ENABLED()) rc = cursor_check_updating(mc); if (unlikely(naf & MDBX_RESERVE)) { node_t *node = page_node(mc->pg[mc->top], mc->ki[mc->top]); if (!(node_flags(node) & N_BIGDATA)) newdata->iov_base = node_data(node); } #if MDBX_ENABLE_PGOP_STAT env->lck->pgops.split.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ } DEBUG("<< mp #%u, rc %d", mp->pgno, rc); return rc; }

/* Replaces the key of the node the cursor currently points at, i.e. the node
 * with index mc->ki[mc->top] on the page mc->pg[mc->top], with `key`.
 *
 * Key sizes are compared after rounding up to an even number of bytes
 * (EVEN_CEIL). When the rounded size changes by `delta` bytes:
 *  - if `delta` exceeds page_room(mp), the node's page number is saved,
 *    node_del() is called and page_split() is invoked with the new key, the
 *    saved page number and MDBX_SPLIT_REPLACE; the result of that path
 *    (followed by cursor_check_updating() when AUDIT_ENABLED()) is returned;
 *  - otherwise the page content between mp->upper and the end of the node's
 *    header is shifted by `delta` in place, and the affected entries[]
 *    offsets and mp->upper are adjusted accordingly.
 * Finally the node's key size is set to key->iov_len and the key bytes are
 * copied into the node.
 *
 * The cursor is asserted to be tracked (cursor_is_tracked()).
 *
 * Returns MDBX_SUCCESS on the in-place path, or whatever page_split() /
 * cursor_check_updating() returned on the split path. */
int tree_propagate_key(MDBX_cursor *mc, const MDBX_val *key) {
  page_t *mp;
  node_t *node;
  size_t len;
  ptrdiff_t delta, ksize, oksize;
  intptr_t ptr, i, nkeys, indx;
  DKBUF_DEBUG;

  cASSERT(mc, cursor_is_tracked(mc));
  indx = mc->ki[mc->top];
  mp = mc->pg[mc->top];
  node = page_node(mp, indx);
  /* entries[] value of the node being updated; it is used below as the
   * node's offset relative to the end of the page header (PAGEHDRSZ). */
  ptr = mp->entries[indx];
#if MDBX_DEBUG
  /* Snapshot of the current (old) key, only for the debug message below. */
  MDBX_val k2;
  k2.iov_base = node_key(node);
  k2.iov_len = node_ks(node);
  DEBUG("update key %zi (offset %zu) [%s] to [%s] on page %" PRIaPGNO, indx, ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->pgno);
#endif /* MDBX_DEBUG */

  /* Sizes must be 2-byte aligned. */
  ksize = EVEN_CEIL(key->iov_len);
  oksize = EVEN_CEIL(node_ks(node));
  /* Signed difference: positive when the key grows, negative when it
   * shrinks, zero when the rounded size is unchanged. */
  delta = ksize - oksize;

  /* Shift node contents if EVEN_CEIL(key length) changed. */
  if (delta) {
    if (delta > (int)page_room(mp)) {
      /* not enough space left, do a delete and split */
      DEBUG("Not enough room, delta = %zd, splitting...", delta);
      /* Read the page number before node_del(), since `node` is not
       * re-fetched afterwards. */
      pgno_t pgno = node_pgno(node);
      node_del(mc, 0);
      int err = page_split(mc, key, nullptr, pgno, MDBX_SPLIT_REPLACE);
      if (err == MDBX_SUCCESS && AUDIT_ENABLED())
        err = cursor_check_updating(mc);
      return err;
    }

    /* Every entry whose offset does not exceed the updated node's offset
     * (the node itself included) refers to data inside the region moved
     * below, so its offset is changed by the same amount. */
    nkeys = page_numkeys(mp);
    for (i = 0; i < nkeys; i++) {
      if (mp->entries[i] <= ptr) {
        cASSERT(mc, mp->entries[i] >= delta);
        mp->entries[i] -= (indx_t)delta;
      }
    }

    /* Move the bytes from mp->upper up to and including NODESIZE bytes at
     * the node's offset (presumably the node header; the key bytes that
     * follow are rewritten below) by -delta. Source and destination may
     * overlap, hence memmove(). */
    void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ);
    len = ptr - mp->upper + NODESIZE;
    memmove(ptr_disp(base, -delta), base, len);
    cASSERT(mc, mp->upper >= delta);
    mp->upper -= (indx_t)delta;

    /* entries[indx] has changed, so look the node up again. */
    node = page_node(mp, indx);
  }

  /* But even if no shift was needed, update ksize */
  node_set_ks(node, key->iov_len);

  /* Skip the copy for an empty key: key->iov_base is then not passed to
   * memcpy() at all. */
  if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0))
    memcpy(node_key(node), key->iov_base, key->iov_len);
  return MDBX_SUCCESS;
}