libmdbx/src/page-ops.c
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
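/* Recover the outer tree from a cursor operating on a nested (dupsort)
* tree: an inner cursor's `tree` points at the subcur_t embedded into a
* cursor_couple_t, so two container_of() steps reach the owning couple and
* thereby the outer b-tree. Used below to mirror the branch/leaf page
* accounting of nested trees into the outer tree. */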
static inline tree_t *outer_tree(MDBX_cursor *mc) {
cASSERT(mc, (mc->flags & z_inner) != 0);
subcur_t *mx = container_of(mc->tree, subcur_t, nested_tree);
cursor_couple_t *couple = container_of(mx, cursor_couple_t, inner);
cASSERT(mc, mc->tree == &couple->outer.subcur->nested_tree);
cASSERT(mc, &mc->clc->k == &couple->outer.clc->v);
return couple->outer.tree;
}
pgr_t page_new(MDBX_cursor *mc, const unsigned flags) {
cASSERT(mc, (flags & P_LARGE) == 0);
pgr_t ret = gc_alloc_single(mc);
if (unlikely(ret.err != MDBX_SUCCESS))
return ret;
DEBUG("db %zu allocated new page %" PRIaPGNO, cursor_dbi(mc), ret.page->pgno);
ret.page->flags = (uint16_t)flags;
cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY);
cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY);
#if MDBX_ENABLE_PGOP_STAT
mc->txn->env->lck->pgops.newly.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
STATIC_ASSERT(P_BRANCH == 1);
const unsigned is_branch = flags & P_BRANCH;
ret.page->lower = 0;
ret.page->upper = (indx_t)(mc->txn->env->ps - PAGEHDRSZ);
mc->tree->branch_pages += is_branch;
mc->tree->leaf_pages += 1 - is_branch;
if (unlikely(mc->flags & z_inner)) {
tree_t *outer = outer_tree(mc);
outer->branch_pages += is_branch;
outer->leaf_pages += 1 - is_branch;
}
return ret;
}
pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) {
pgr_t ret = likely(npages == 1) ? gc_alloc_single(mc) : gc_alloc_ex(mc, npages, ALLOC_DEFAULT);
if (unlikely(ret.err != MDBX_SUCCESS))
return ret;
DEBUG("dbi %zu allocated new large-page %" PRIaPGNO ", num %zu", cursor_dbi(mc), ret.page->pgno, npages);
ret.page->flags = P_LARGE;
cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY);
cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY);
#if MDBX_ENABLE_PGOP_STAT
mc->txn->env->lck->pgops.newly.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
mc->tree->large_pages += (pgno_t)npages;
ret.page->pages = (pgno_t)npages;
cASSERT(mc, !(mc->flags & z_inner));
return ret;
}
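/* Note on page_copy: for a regular non-full page only the used head (the
* header plus the entry index growing up towards `lower`) and the used tail
* (node data growing down from `upper`) carry meaning, so the unused gap in
* between is skipped whenever it spans more than three cache lines. The
* head/tail boundaries are rounded to pointer size so memcpy can run
* word-wise. */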
__hot void page_copy(page_t *const dst, const page_t *const src, const size_t size) {
STATIC_ASSERT(UINT16_MAX > MDBX_MAX_PAGESIZE - PAGEHDRSZ);
STATIC_ASSERT(MDBX_MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4);
void *copy_dst = dst;
const void *copy_src = src;
size_t copy_len = size;
if (src->flags & P_DUPFIX) {
copy_len = PAGEHDRSZ + src->dupfix_ksize * page_numkeys(src);
if (unlikely(copy_len > size))
goto bailout;
} else if ((src->flags & P_LARGE) == 0) {
size_t upper = src->upper, lower = src->lower;
intptr_t unused = upper - lower;
/* If page isn't full, just copy the used portion. Adjust
* alignment so memcpy may copy words instead of bytes. */
if (unused > MDBX_CACHELINE_SIZE * 3) {
lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *));
upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *));
if (unlikely(upper > copy_len))
goto bailout;
memcpy(copy_dst, copy_src, lower);
copy_dst = ptr_disp(copy_dst, upper);
copy_src = ptr_disp(copy_src, upper);
copy_len -= upper;
}
}
memcpy(copy_dst, copy_src, copy_len);
return;
bailout:
if (src->flags & P_DUPFIX)
bad_page(src, "%s addr %p, n-keys %zu, ksize %u", "invalid/corrupted source page", __Wpedantic_format_voidptr(src),
page_numkeys(src), src->dupfix_ksize);
else
bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page", __Wpedantic_format_voidptr(src), src->upper);
memset(dst, -1, size);
}
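/* A spilled page was flushed to disk in the middle of a transaction to
* free dirty-room, while still belonging to the transaction's write-set.
* To modify such a page again it must be "unspilled": copied back into a
* shadow page and re-inserted into the dirty list. The lookup walks the
* spill lists of this transaction and its ancestors, since the page may
* have been spilled by a parent; a parent-spilled page keeps the P_SPILLED
* mark until the child commits. */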
__cold pgr_t __must_check_result page_unspill(MDBX_txn *const txn, const page_t *const mp) {
VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0);
tASSERT(txn, is_spilled(txn, mp));
const MDBX_txn *scan = txn;
pgr_t ret;
do {
tASSERT(txn, (scan->flags & MDBX_TXN_SPILLS) != 0);
const size_t si = spill_search(scan, mp->pgno);
if (!si)
continue;
const unsigned npages = is_largepage(mp) ? mp->pages : 1;
ret.page = page_shadow_alloc(txn, npages);
if (unlikely(!ret.page)) {
ret.err = MDBX_ENOMEM;
return ret;
}
page_copy(ret.page, mp, pgno2bytes(txn->env, npages));
if (scan == txn) {
/* If in current txn, this page is no longer spilled.
* If it happens to be the last page, truncate the spill list.
* Otherwise mark it as deleted by setting the LSB. */
spill_remove(txn, si, npages);
} /* otherwise, if belonging to a parent txn, the
* page remains spilled until child commits */
ret.err = page_dirty(txn, ret.page, npages);
if (unlikely(ret.err != MDBX_SUCCESS))
return ret;
#if MDBX_ENABLE_PGOP_STAT
txn->env->lck->pgops.unspill.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
ret.page->flags |= (scan == txn) ? 0 : P_SPILLED;
ret.err = MDBX_SUCCESS;
return ret;
} while (likely((scan = scan->parent) != nullptr && (scan->flags & MDBX_TXN_SPILLS) != 0));
ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN " not found in the spill-list(s), current txn %" PRIaTXN
" front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN,
mp->pgno, mp->txnid, txn->txnid, txn->front_txnid, txn->env->basal_txn->txnid,
txn->env->basal_txn->front_txnid);
ret.err = MDBX_PROBLEM;
ret.page = nullptr;
return ret;
}
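/* "Touching" a page makes it writable within the current transaction.
* Depending on the page status this is either a mere LRU-stamp update (the
* page is already modifiable), an unspill (spilled), a copy-on-write into a
* freshly allocated pgno with retirement of the old one (frozen, i.e.
* possibly visible to readers), or a clone of a parent transaction's dirty
* page under the same pgno (shadowed). */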
__hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) {
tASSERT(txn, is_modifable(txn, mp) && txn->tw.dirtylist);
tASSERT(txn, !is_largepage(mp) && !is_subpage(mp));
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
const size_t n = dpl_search(txn, mp->pgno);
if (MDBX_AVOID_MSYNC && unlikely(txn->tw.dirtylist->items[n].pgno != mp->pgno)) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP));
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1);
VERBOSE("unspill page %" PRIaPGNO, mp->pgno);
#if MDBX_ENABLE_PGOP_STAT
txn->env->lck->pgops.unspill.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
return page_dirty(txn, (page_t *)mp, 1);
}
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length);
tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->pgno && txn->tw.dirtylist->items[n].ptr == mp);
if (!MDBX_AVOID_MSYNC || (txn->flags & MDBX_WRITEMAP) == 0) {
size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->tw.dirtylru;
}
return MDBX_SUCCESS;
}
__hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, const page_t *const mp) {
tASSERT(txn, !is_modifable(txn, mp) && !is_largepage(mp));
if (is_subpage(mp)) {
((page_t *)mp)->txnid = txn->front_txnid;
return MDBX_SUCCESS;
}
int rc;
page_t *np;
if (is_frozen(txn, mp)) {
/* CoW the page */
rc = pnl_need(&txn->tw.retired_pages, 1);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
const pgr_t par = gc_alloc_single(mc);
rc = par.err;
np = par.page;
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
const pgno_t pgno = np->pgno;
DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno, pgno);
tASSERT(txn, mp->pgno != pgno);
pnl_append_prereserved(txn->tw.retired_pages, mp->pgno);
/* Update the parent page, if any, to point to the new page */
if (likely(mc->top)) {
page_t *parent = mc->pg[mc->top - 1];
node_t *node = page_node(parent, mc->ki[mc->top - 1]);
node_set_pgno(node, pgno);
} else {
mc->tree->root = pgno;
}
#if MDBX_ENABLE_PGOP_STAT
txn->env->lck->pgops.cow.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
page_copy(np, mp, txn->env->ps);
np->pgno = pgno;
np->txnid = txn->front_txnid;
} else if (is_spilled(txn, mp)) {
pgr_t pur = page_unspill(txn, mp);
np = pur.page;
rc = pur.err;
if (likely(rc == MDBX_SUCCESS)) {
tASSERT(txn, np != nullptr);
goto done;
}
goto fail;
} else {
if (unlikely(!txn->parent)) {
ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s "
"page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
" without parent transaction, current txn %" PRIaTXN " front %" PRIaTXN,
is_branch(mp) ? "branch" : "leaf", mp->pgno, mp->txnid, mc->txn->txnid, mc->txn->front_txnid);
rc = MDBX_PROBLEM;
goto fail;
}
DEBUG("clone db %d page %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno);
tASSERT(txn, txn->tw.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE);
/* No - copy it */
np = page_shadow_alloc(txn, 1);
if (unlikely(!np)) {
rc = MDBX_ENOMEM;
goto fail;
}
page_copy(np, mp, txn->env->ps);
/* insert a clone of parent's dirty page, so don't touch dirtyroom */
rc = page_dirty(txn, np, 1);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
#if MDBX_ENABLE_PGOP_STAT
txn->env->lck->pgops.clone.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
}
done:
/* Adjust cursors pointing to mp */
mc->pg[mc->top] = np;
MDBX_cursor *m2 = txn->cursors[cursor_dbi(mc)];
if (mc->flags & z_inner) {
for (; m2; m2 = m2->next) {
MDBX_cursor *m3 = &m2->subcur->cursor;
if (m3->top < mc->top)
continue;
if (m3->pg[mc->top] == mp)
m3->pg[mc->top] = np;
}
} else {
for (; m2; m2 = m2->next) {
if (m2->top < mc->top)
continue;
if (m2->pg[mc->top] == mp) {
m2->pg[mc->top] = np;
if (is_leaf(np) && inner_pointed(m2))
cursor_inner_refresh(m2, np, m2->ki[mc->top]);
}
}
}
return MDBX_SUCCESS;
fail:
txn->flags |= MDBX_TXN_ERROR;
return rc;
}
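/* Shadow pages are heap copies backing dirty pages when writes don't go
* through the memory map (no MDBX_WRITEMAP, or MDBX_AVOID_MSYNC). Each
* buffer is prefixed with an extra size_t slot, which page_touch_modifable()
* above uses to store the dirty-LRU stamp. Single-page buffers are recycled
* through the env->shadow_reserve free-list, larger ones are malloc'd and
* freed directly. */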
page_t *page_shadow_alloc(MDBX_txn *txn, size_t num) {
MDBX_env *env = txn->env;
page_t *np = env->shadow_reserve;
size_t size = env->ps;
if (likely(num == 1 && np)) {
eASSERT(env, env->shadow_reserve_len > 0);
MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size);
VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)), size + sizeof(size_t));
VALGRIND_MAKE_MEM_DEFINED(&page_next(np), sizeof(page_t *));
env->shadow_reserve = page_next(np);
env->shadow_reserve_len -= 1;
} else {
size = pgno2bytes(env, num);
void *const ptr = osal_malloc(size + sizeof(size_t));
if (unlikely(!ptr)) {
txn->flags |= MDBX_TXN_ERROR;
return nullptr;
}
VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t));
np = ptr_disp(ptr, sizeof(size_t));
}
if ((env->flags & MDBX_NOMEMINIT) == 0) {
/* For a single-page alloc, init everything after the page header.
* For a multi-page alloc, init only the tail of the final page: a caller
* asking for that many pages will fill everything up to the last page. */
size_t skip = PAGEHDRSZ;
if (num > 1)
skip += pgno2bytes(env, num - 1);
memset(ptr_disp(np, skip), 0, size - skip);
}
#if MDBX_DEBUG
np->pgno = 0;
#endif
VALGRIND_MAKE_MEM_UNDEFINED(np, size);
np->flags = 0;
np->pages = (pgno_t)num;
return np;
}
void page_shadow_release(MDBX_env *env, page_t *dp, size_t npages) {
VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages));
MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages));
if (unlikely(env->flags & MDBX_PAGEPERTURB))
memset(dp, -1, pgno2bytes(env, npages));
if (likely(npages == 1 && env->shadow_reserve_len < env->options.dp_reserve_limit)) {
MDBX_ASAN_POISON_MEMORY_REGION(dp, env->ps);
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *));
page_next(dp) = env->shadow_reserve;
VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)));
env->shadow_reserve = dp;
env->shadow_reserve_len += 1;
} else {
/* large pages just get freed directly */
void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
VALGRIND_MEMPOOL_FREE(env, ptr);
osal_free(ptr);
}
}
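/* page_kill wipes the on-disk content of freed pages (debug builds and
* MDBX_PAGEPERTURB). A non-frozen page lives in a writable shadow or map,
* so it is memset with 0xFF and written out with a single pwrite. A frozen
* page must not be touched in memory, so a pre-filled pattern page inside
* env->page_auxbuf is written over it page-by-page using gathered iovec
* writes. */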
__cold static void page_kill(MDBX_txn *txn, page_t *mp, pgno_t pgno, size_t npages) {
MDBX_env *const env = txn->env;
DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno);
eASSERT(env, pgno >= NUM_METAS && npages);
if (!is_frozen(txn, mp)) {
const size_t bytes = pgno2bytes(env, npages);
memset(mp, -1, bytes);
mp->pgno = pgno;
if ((txn->flags & MDBX_WRITEMAP) == 0)
osal_pwrite(env->lazy_fd, mp, bytes, pgno2bytes(env, pgno));
} else {
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
iov[0].iov_len = env->ps;
iov[0].iov_base = ptr_disp(env->page_auxbuf, env->ps);
size_t iov_off = pgno2bytes(env, pgno), n = 1;
while (--npages) {
iov[n] = iov[0];
if (++n == MDBX_AUXILARY_IOV_MAX) {
osal_pwritev(env->lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, iov_off);
iov_off += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
n = 0;
}
}
osal_pwritev(env->lazy_fd, iov, n, iov_off);
}
}
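/* A dirty page freed within the same transaction may be kept on the
* "loose" list for immediate reuse instead of being routed through the GC.
* Pages close to the unallocated tail are deliberately not loosened so they
* stay eligible for refund (moving first_unallocated back, keeping the file
* compact). Illustrative numbers: with dp_loose_limit = 64 and
* first_unallocated = 1000, freeing page 100 satisfies 1000 > 100 + 64 and
* may be loosened, while freeing page 950 fails both checks and goes toward
* reclaim/refund instead. */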
static inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) {
/* TODO:
* 1) when sequence preservation is enabled, check that the page does not
* adjoin any page already present in the reclaimed list;
* 2) consider dropping half of an over-grown loose list into the
* reclaimed list. */
return txn->tw.loose_count < txn->env->options.dp_loose_limit &&
(!MDBX_ENABLE_REFUND ||
/* skip pages near the end in favor of compactification */
txn->geo.first_unallocated > pgno + txn->env->options.dp_loose_limit ||
txn->geo.first_unallocated <= txn->env->options.dp_loose_limit);
}
/* Retire, loosen or free a single page.
*
* For dirty pages, saves single pages to a list for future reuse in this same
* txn. It has been pulled from the GC and already resides on the dirty list,
* but has been deleted. Use these pages first before pulling again from the GC.
*
* If the page wasn't dirtied in this txn, just add it
* to this txn's free list. */
int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, page_t *mp /* maybe null */,
unsigned pageflags /* maybe unknown/zero */) {
int rc;
MDBX_txn *const txn = mc->txn;
tASSERT(txn, !mp || (mp->pgno == pgno && mp->flags == pageflags));
/* During deleting entire subtrees, it is reasonable and possible to avoid
* reading leaf pages, i.e. significantly reduce hard page-faults & IOPs:
* - mp is null, i.e. the page has not yet been read;
* - pagetype is known and the P_LEAF bit is set;
* - we can determine the page status via scanning the lists
* of dirty and spilled pages.
*
* On the other hand, this could be suboptimal for WRITEMAP mode, since it
* requires maintaining the dirty-page list and avoiding explicit spilling.
* So, for flexibility and to avoid extra internal dependencies, we just
* fall back to reading the page if the dirty list was not allocated yet. */
size_t di = 0, si = 0, npages = 1;
enum page_status { unknown, frozen, spilled, shadowed, modifable } status = unknown;
if (unlikely(!mp)) {
if (ASSERT_ENABLED() && pageflags) {
pgr_t check;
check = page_get_any(mc, pgno, txn->front_txnid);
if (unlikely(check.err != MDBX_SUCCESS))
return check.err;
tASSERT(txn, ((unsigned)check.page->flags & ~P_SPILLED) == (pageflags & ~P_FROZEN));
tASSERT(txn, !(pageflags & P_FROZEN) || is_frozen(txn, check.page));
}
if (pageflags & P_FROZEN) {
status = frozen;
if (ASSERT_ENABLED()) {
for (MDBX_txn *scan = txn; scan; scan = scan->parent) {
tASSERT(txn, !txn->tw.spilled.list || !spill_search(scan, pgno));
tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno));
}
}
goto status_done;
} else if (pageflags && txn->tw.dirtylist) {
if ((di = dpl_exist(txn, pgno)) != 0) {
mp = txn->tw.dirtylist->items[di].ptr;
tASSERT(txn, is_modifable(txn, mp));
status = modifable;
goto status_done;
}
if ((si = spill_search(txn, pgno)) != 0) {
status = spilled;
goto status_done;
}
for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
if (dpl_exist(parent, pgno)) {
status = shadowed;
goto status_done;
}
if (spill_search(parent, pgno)) {
status = spilled;
goto status_done;
}
}
status = frozen;
goto status_done;
}
pgr_t pg = page_get_any(mc, pgno, txn->front_txnid);
if (unlikely(pg.err != MDBX_SUCCESS))
return pg.err;
mp = pg.page;
tASSERT(txn, !pageflags || mp->flags == pageflags);
pageflags = mp->flags;
}
if (is_frozen(txn, mp)) {
status = frozen;
tASSERT(txn, !is_modifable(txn, mp));
tASSERT(txn, !is_spilled(txn, mp));
tASSERT(txn, !is_shadowed(txn, mp));
tASSERT(txn, !debug_dpl_find(txn, pgno));
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
} else if (is_modifable(txn, mp)) {
status = modifable;
if (txn->tw.dirtylist)
di = dpl_exist(txn, pgno);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) || !is_spilled(txn, mp));
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
} else if (is_shadowed(txn, mp)) {
status = shadowed;
tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno));
tASSERT(txn, !debug_dpl_find(txn, pgno));
} else {
tASSERT(txn, is_spilled(txn, mp));
status = spilled;
si = spill_search(txn, pgno);
tASSERT(txn, !debug_dpl_find(txn, pgno));
}
status_done:
if (likely((pageflags & P_LARGE) == 0)) {
STATIC_ASSERT(P_BRANCH == 1);
const bool is_branch = pageflags & P_BRANCH;
cASSERT(mc, ((pageflags & P_LEAF) == 0) == is_branch);
if (unlikely(mc->flags & z_inner)) {
tree_t *outer = outer_tree(mc);
cASSERT(mc, !is_branch || outer->branch_pages > 0);
outer->branch_pages -= is_branch;
cASSERT(mc, is_branch || outer->leaf_pages > 0);
outer->leaf_pages -= 1 - is_branch;
}
cASSERT(mc, !is_branch || mc->tree->branch_pages > 0);
mc->tree->branch_pages -= is_branch;
cASSERT(mc, is_branch || mc->tree->leaf_pages > 0);
mc->tree->leaf_pages -= 1 - is_branch;
} else {
npages = mp->pages;
cASSERT(mc, mc->tree->large_pages >= npages);
mc->tree->large_pages -= (pgno_t)npages;
}
if (status == frozen) {
retire:
DEBUG("retire %zu page %" PRIaPGNO, npages, pgno);
rc = pnl_append_span(&txn->tw.retired_pages, pgno, npages);
tASSERT(txn, dpl_check(txn));
return rc;
}
/* Return pages to the unallocated "tail" of the database.
* The page contents are not destroyed, and for nested transactions the
* boundary of the unallocated "tail" only moves when they commit. */
if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->geo.first_unallocated)) {
const char *kind = nullptr;
if (status == modifable) {
/* The page is dirty in this transaction, but before that it could have
* been allocated, dirtied and spilled in one of the parent transactions.
* It CAN be pushed out into the unallocated tail. */
kind = "dirty";
/* Remove from dirty list */
page_wash(txn, di, mp, npages);
} else if (si) {
/* The page was spilled in this transaction, i.e. it was allocated and
* dirtied in this or one of the parent transactions.
* It CAN be pushed out into the unallocated tail. */
kind = "spilled";
tASSERT(txn, status == spilled);
spill_remove(txn, si, npages);
} else {
/* The page was allocated, dirtied and possibly spilled in one of the
* parent transactions.
* It CAN be pushed out into the unallocated tail. */
kind = "parent's";
if (ASSERT_ENABLED() && mp) {
kind = nullptr;
for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
if (spill_search(parent, pgno)) {
kind = "parent-spilled";
tASSERT(txn, status == spilled);
break;
}
if (mp == debug_dpl_find(parent, pgno)) {
kind = "parent-dirty";
tASSERT(txn, status == shadowed);
break;
}
}
tASSERT(txn, kind != nullptr);
}
tASSERT(txn, status == spilled || status == shadowed);
}
DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno);
txn->geo.first_unallocated = pgno;
txn_refund(txn);
return MDBX_SUCCESS;
}
if (status == modifable) {
/* Dirty page from this transaction */
/* If suitable, we can reuse it via the loose list */
if (likely(npages == 1 && suitable4loose(txn, pgno)) && (di || !txn->tw.dirtylist)) {
DEBUG("loosen dirty page %" PRIaPGNO, pgno);
if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
memset(page_data(mp), -1, txn->env->ps - PAGEHDRSZ);
mp->txnid = INVALID_TXNID;
mp->flags = P_LOOSE;
page_next(mp) = txn->tw.loose_pages;
txn->tw.loose_pages = mp;
txn->tw.loose_count++;
#if MDBX_ENABLE_REFUND
txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) ? pgno + 2 : txn->tw.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->env->ps - PAGEHDRSZ);
MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), txn->env->ps - PAGEHDRSZ);
return MDBX_SUCCESS;
}
#if !MDBX_DEBUG && !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
if (unlikely(txn->env->flags & MDBX_PAGEPERTURB))
#endif
{
/* The page could have been modified in one of the parent transactions,
* and possibly later spilled out and then loaded and modified again.
* In both cases it must not be wiped on disk nor marked as inaccessible
* for asan and/or valgrind. */
for (MDBX_txn *parent = txn->parent; parent && (parent->flags & MDBX_TXN_SPILLS); parent = parent->parent) {
if (spill_intersect(parent, pgno, npages))
goto skip_invalidate;
if (dpl_intersect(parent, pgno, npages))
goto skip_invalidate;
}
#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)
if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB))
#endif
page_kill(txn, mp, pgno, npages);
if ((txn->flags & MDBX_WRITEMAP) == 0) {
VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->env, pgno)), pgno2bytes(txn->env, npages) - PAGEHDRSZ);
MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->env, pgno)), pgno2bytes(txn->env, npages) - PAGEHDRSZ);
}
}
skip_invalidate:
/* wash dirty page */
page_wash(txn, di, mp, npages);
reclaim:
DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno);
rc = pnl_insert_span(&txn->tw.repnl, pgno, npages);
tASSERT(txn, pnl_check_allocated(txn->tw.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
tASSERT(txn, dpl_check(txn));
return rc;
}
if (si) {
/* Page was spilled in this txn */
spill_remove(txn, si, npages);
/* The page could have been allocated and then spilled in this
* transaction, in which case it must be put into the reclaimed list.
* Or it could have been allocated in one of the parent transactions and
* then spilled in this one, in which case it must be put into the
* retired list for later filtering at commit time. */
for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) {
if (dpl_exist(parent, pgno))
goto retire;
}
/* The page was definitely allocated in this transaction
* and can now be reused. */
goto reclaim;
}
if (status == shadowed) {
/* Dirty page MUST BE a clone from (one of) parent transaction(s). */
if (ASSERT_ENABLED()) {
const page_t *parent_dp = nullptr;
/* Check parent(s)'s dirty lists. */
for (MDBX_txn *parent = txn->parent; parent && !parent_dp; parent = parent->parent) {
tASSERT(txn, !spill_search(parent, pgno));
parent_dp = debug_dpl_find(parent, pgno);
}
tASSERT(txn, parent_dp && (!mp || parent_dp == mp));
}
/* The page was allocated in a parent transaction and can now be reused,
* but only within this transaction or its children. */
goto reclaim;
}
/* The page may belong to an MVCC snapshot visible to readers, or it
* could have been allocated and then spilled in one of the parent
* transactions. So for now we put it into the retired list, which will
* be filtered against the dirty and spilled lists of the parent
* transactions when a child commits, or else will be written to the GC
* unchanged. */
goto retire;
}
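/* Summary of page_retire_ex() outcomes:
* - frozen (belongs to a reader-visible snapshot) -> retired_pages, and
* eventually the GC;
* - adjoins the unallocated tail -> refund, first_unallocated moves back;
* - dirty in this txn -> loose list (a single suitable page), otherwise
* washed from the dirty list and inserted into repnl;
* - spilled in this txn -> repnl if it was also allocated here, otherwise
* retired_pages;
* - shadowed (a parent's dirty page) -> repnl, reusable within this txn
* and its children;
* - everything else -> retired_pages. */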
__hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
mp->txnid = txn->front_txnid;
if (!txn->tw.dirtylist) {
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
txn->tw.writemap_dirty_npages += npages;
tASSERT(txn, txn->tw.spilled.list == nullptr);
return MDBX_SUCCESS;
}
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
#if xMDBX_DEBUG_SPILLING == 2
txn->env->debug_dirtied_act += 1;
ENSURE(txn->env, txn->env->debug_dirtied_act < txn->env->debug_dirtied_est);
ENSURE(txn->env, txn->tw.dirtyroom + txn->tw.loose_count > 0);
#endif /* xMDBX_DEBUG_SPILLING == 2 */
int rc;
if (unlikely(txn->tw.dirtyroom == 0)) {
if (txn->tw.loose_count) {
page_t *lp = txn->tw.loose_pages;
DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->pgno);
rc = pnl_insert_span(&txn->tw.repnl, lp->pgno, 1);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
size_t di = dpl_search(txn, lp->pgno);
tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp);
dpl_remove(txn, di);
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
txn->tw.loose_pages = page_next(lp);
txn->tw.loose_count--;
txn->tw.dirtyroom++;
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
page_shadow_release(txn->env, lp, 1);
} else {
ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length);
if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP))
page_shadow_release(txn->env, mp, npages);
return MDBX_TXN_FULL;
}
}
rc = dpl_append(txn, mp->pgno, mp, npages);
if (unlikely(rc != MDBX_SUCCESS)) {
bailout:
txn->flags |= MDBX_TXN_ERROR;
return rc;
}
txn->tw.dirtyroom--;
tASSERT(txn, dpl_check(txn));
return MDBX_SUCCESS;
}
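/* The subpage.* option values are fractions in 1/65536 units, so the
* expression (whole * option + 32767) >> 16 computes whole * option / 65536
* rounded to nearest. For example, whole = 4000 with option = 32768 (one
* half) yields (4000 * 32768 + 32767) >> 16 == 2000. */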
void recalculate_subpage_thresholds(MDBX_env *env) {
size_t whole = env->leaf_nodemax - NODESIZE;
env->subpage_limit = (whole * env->options.subpage.limit + 32767) >> 16;
whole = env->subpage_limit;
env->subpage_reserve_limit = (whole * env->options.subpage.reserve_limit + 32767) >> 16;
eASSERT(env, env->leaf_nodemax >= env->subpage_limit + NODESIZE);
eASSERT(env, env->subpage_limit >= env->subpage_reserve_limit);
whole = env->leaf_nodemax;
env->subpage_room_threshold = (whole * env->options.subpage.room_threshold + 32767) >> 16;
env->subpage_reserve_prereq = (whole * env->options.subpage.reserve_prereq + 32767) >> 16;
if (env->subpage_room_threshold + env->subpage_reserve_limit > (intptr_t)page_space(env))
env->subpage_reserve_prereq = page_space(env);
else if (env->subpage_reserve_prereq < env->subpage_room_threshold + env->subpage_reserve_limit)
env->subpage_reserve_prereq = env->subpage_room_threshold + env->subpage_reserve_limit;
eASSERT(env, env->subpage_reserve_prereq > env->subpage_room_threshold + env->subpage_reserve_limit);
}
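/* Computes how many spare bytes to reserve inside a DUPFIX subpage so that
* a few future items can be appended without immediately rebuilding it: the
* prospective subpage is grown by whole items (at most 5) while the reserve
* limit, the subpage size limit and the host page's spare room all permit;
* the trailing (subpage_len & 1) pads the subpage to an even length. */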
size_t page_subleaf2_reserve(const MDBX_env *env, size_t host_page_room, size_t subpage_len, size_t item_len) {
eASSERT(env, (subpage_len & 1) == 0);
eASSERT(env, env->leaf_nodemax >= env->subpage_limit + NODESIZE);
size_t reserve = 0;
for (size_t n = 0; n < 5 && reserve + item_len <= env->subpage_reserve_limit &&
EVEN_CEIL(subpage_len + item_len) <= env->subpage_limit &&
host_page_room >= env->subpage_reserve_prereq + EVEN_CEIL(subpage_len + item_len);
++n) {
subpage_len += item_len;
reserve += item_len;
}
return reserve + (subpage_len & 1);
}