mdbx: rework/fix retire-to-parent-txn pages (fix LMDB bug).

Change-Id: I81c3d48f19b4c7e62d77cfecc167235374f66402
Leonid Yuriev, 2019-10-17 14:07:47 +03:00
parent 659933d0c9
commit 5c54566c5c
2 changed files with 187 additions and 212 deletions
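
For orientation, a minimal sketch of the mechanism this commit introduces, using simplified stand-in types (`page`, `txn`, `dirty_find()` and `loosen_or_defer()` below are illustrative only, not libmdbx's real `MDBX_page`/`MDBX_txn` or dirty-page-list machinery): when a nested transaction retires a page that belongs to an ancestor's dirty list rather than its own, the page cannot be recycled locally; it is chained onto a per-transaction retired2parent list and handed back to the parent when the nested transaction commits.

/* Sketch only: simplified stand-ins for the retire-to-parent decision. */
#include <stddef.h>

typedef struct page { struct page *next; unsigned pgno; } page;

typedef struct txn {
  struct txn *parent;
  page **dirty;                 /* this txn's own dirty pages (simplified) */
  size_t dirty_len;
  page *retired2parent_pages;   /* deferred to the parent at commit */
  unsigned retired2parent_count;
  page *loose_pages;            /* reusable within this txn */
  unsigned loose_count;
} txn;

static page *dirty_find(txn *t, unsigned pgno) {
  for (size_t i = 0; i < t->dirty_len; ++i)
    if (t->dirty[i]->pgno == pgno)
      return t->dirty[i];
  return NULL;
}

/* A page retired by a nested txn but dirtied only by an ancestor cannot be
 * loosened locally: defer it onto retired2parent_pages instead (the real
 * code counts npages for overflow pages). Pages dirty in this txn go
 * straight onto its loose list. */
static void loosen_or_defer(txn *t, page *mp) {
  if (t->parent && dirty_find(t, mp->pgno) == NULL) {
    mp->next = t->retired2parent_pages;
    t->retired2parent_pages = mp;
    t->retired2parent_count += 1;
    return;
  }
  mp->next = t->loose_pages;
  t->loose_pages = mp;
  t->loose_count += 1;
}
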


@ -2172,18 +2172,6 @@ static size_t bytes_align2os_bytes(const MDBX_env *env, size_t bytes) {
return mdbx_roundup2(mdbx_roundup2(bytes, env->me_psize), env->me_os_psize);
}
static void __cold mdbx_kill_page(MDBX_env *env, MDBX_page *mp) {
mdbx_assert(env, mp->mp_pgno >= NUM_METAS);
const size_t len = env->me_psize - PAGEHDRSZ;
void *ptr = (env->me_flags & MDBX_WRITEMAP)
? &mp->mp_data
: (void *)((uint8_t *)env->me_pbuf + env->me_psize);
memset(ptr, 0x6F /* 'o', 111 */, len);
if (ptr != &mp->mp_data)
(void)mdbx_pwrite(env->me_fd, ptr, len,
pgno2bytes(env, mp->mp_pgno) + PAGEHDRSZ);
}
static __inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) {
mdbx_cassert(mc, (mc->mc_flags & C_SUB) != 0);
MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db);
@ -2355,16 +2343,85 @@ static __must_check_result int mdbx_refund_loose(MDBX_txn *txn, MDBX_page *mp) {
static __must_check_result __hot int mdbx_loose_page(MDBX_txn *txn,
MDBX_page *mp) {
VALGRIND_MAKE_MEM_UNDEFINED(mp, txn->mt_env->me_psize);
VALGRIND_MAKE_MEM_DEFINED(&mp->mp_pgno, sizeof(mp->mp_pgno));
if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
mdbx_kill_page(txn->mt_env, mp);
const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
pgno_t pgno = mp->mp_pgno;
if (txn->mt_parent) {
mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0);
mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno));
/* If txn has a parent, make sure the page is in our dirty list. */
MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno);
/* TODO: use extended flag-mask to track parent's dirty-pages */
if (dp == nullptr) {
mp->mp_next = txn->tw.retired2parent_pages;
txn->tw.retired2parent_pages = mp;
txn->tw.retired2parent_count += npages;
return MDBX_SUCCESS;
}
if (unlikely(mp != dp)) { /* bad cursor? */
mdbx_error("wrong page 0x%p #%" PRIaPGNO
" in the dirtylist, expecting %p",
dp, pgno, mp);
txn->mt_flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM;
}
/* ok, it's ours */
}
mdbx_debug("loosen page %" PRIaPGNO, pgno);
if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) {
mdbx_tassert(txn, pgno >= NUM_METAS);
const size_t bytes = pgno2bytes(txn->mt_env, npages);
memset(mp, 0, bytes);
if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0)
mdbx_pwrite(txn->mt_env->me_fd, mp, bytes, pgno2bytes(txn->mt_env, pgno));
VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
}
VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_data, txn->mt_env->me_psize - PAGEHDRSZ);
ASAN_POISON_MEMORY_REGION(&mp->mp_data, txn->mt_env->me_psize - PAGEHDRSZ);
if (unlikely(npages >
1 /* overflow pages don't come to the loose-list */)) {
if (IS_DIRTY(mp)) {
/* Remove from dirty list */
MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, mp->mp_pgno);
if (unlikely(dp != mp)) {
mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", mp,
mp->mp_pgno);
txn->mt_flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM;
}
txn->tw.dirtyroom++;
mdbx_tassert(txn, txn->mt_parent ||
txn->tw.dirtyroom + txn->tw.dirtylist->length ==
MDBX_DPL_TXNFULL);
if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
mdbx_dpage_free(txn->mt_env, mp, npages);
}
int rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, npages);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
/* Insert in me_reclaimed_pglist */
MDBX_PNL pnl = txn->tw.reclaimed_pglist;
unsigned r, w = MDBX_PNL_SIZE(pnl) + npages;
for (r = MDBX_PNL_SIZE(pnl); r && MDBX_PNL_DISORDERED(pnl[r], pgno);)
pnl[w--] = pnl[r--];
MDBX_PNL_SIZE(pnl) += npages;
pgno = MDBX_PNL_ASCENDING ? pgno + npages : pgno;
while (w > r)
pnl[w--] = MDBX_PNL_ASCENDING ? --pgno : pgno++;
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno));
return MDBX_SUCCESS;
}
mp->mp_flags = P_LOOSE | P_DIRTY;
if (likely(txn->mt_next_pgno != mp->mp_pgno + 1)) {
mp->mp_pgno = pgno;
if (likely(txn->mt_next_pgno != pgno + 1)) {
mp->mp_next = txn->tw.loose_pages;
txn->tw.loose_pages = mp;
txn->tw.loose_count++;
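
The reclaimed_pglist update in this hunk is a single-pass insertion of a run of `npages` consecutive page numbers into an already-sorted PNL. A stand-alone sketch of the same idea, assuming a plain 1-based `uint32_t` array in ascending order (no `MDBX_PNL_ASCENDING` switch, no libmdbx types):

/* Sketch only: insert a consecutive run into a sorted, 1-based array. */
#include <stdint.h>
#include <stdio.h>

/* The caller must have reserved room for npages extra entries. */
static void pnl_insert_run(uint32_t *list, unsigned *size,
                           uint32_t pgno, unsigned npages) {
  unsigned r = *size, w = *size + npages;
  while (r && list[r] > pgno)         /* shift entries above the run right */
    list[w--] = list[r--];
  *size += npages;
  uint32_t n = pgno + npages;         /* fill the gap with the run */
  while (w > r)
    list[w--] = --n;
}

int main(void) {
  uint32_t list[16] = {0, 3, 4, 9};   /* slot 0 unused, 1-based like a PNL */
  unsigned size = 3;
  pnl_insert_run(list, &size, 6, 2);  /* insert pages 6 and 7 */
  for (unsigned i = 1; i <= size; ++i)
    printf("%u ", (unsigned)list[i]); /* prints: 3 4 6 7 9 */
  return 0;
}

In the diff the same shift runs from the PNL's tail, and `MDBX_PNL_ASCENDING` selects whether the run is written in increasing or decreasing order.
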
@ -2378,51 +2435,65 @@ static __must_check_result __hot int mdbx_loose_page(MDBX_txn *txn,
static __must_check_result __hot int mdbx_retire_page(MDBX_cursor *mc,
MDBX_page *mp) {
mdbx_cassert(mc, !IS_OVERFLOW(mp));
const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
const pgno_t pgno = mp->mp_pgno;
MDBX_txn *const txn = mc->mc_txn;
if (unlikely(mc->mc_flags & C_SUB)) {
MDBX_db *outer = mdbx_outer_db(mc);
mdbx_cassert(mc, !IS_BRANCH(mp) || outer->md_branch_pages > 0);
outer->md_branch_pages -= IS_BRANCH(mp);
mdbx_cassert(mc, !IS_LEAF(mp) || outer->md_leaf_pages > 0);
outer->md_leaf_pages -= IS_LEAF(mp);
mdbx_cassert(mc, !IS_OVERFLOW(mp));
}
mdbx_cassert(mc, !IS_BRANCH(mp) || mc->mc_db->md_branch_pages > 0);
mc->mc_db->md_branch_pages -= IS_BRANCH(mp);
mdbx_cassert(mc, !IS_LEAF(mp) || mc->mc_db->md_leaf_pages > 0);
mc->mc_db->md_leaf_pages -= IS_LEAF(mp);
MDBX_txn *const txn = mc->mc_txn;
const pgno_t pgno = mp->mp_pgno;
bool loose = false;
mdbx_cassert(mc, !IS_OVERFLOW(mp) || mc->mc_db->md_overflow_pages >= npages);
mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? npages : 0;
if (IS_DIRTY(mp)) {
loose = true /* in case no parent txn, so it's just ours */;
if (txn->mt_parent) {
/* If txn has a parent, make sure the page is in our dirty list. */
loose = false;
mdbx_cassert(mc, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0);
MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno);
if (dp) {
if (unlikely(mp != dp)) { /* bad cursor? */
mdbx_error("wrong page 0x%p #%" PRIaPGNO
" in the dirtylist, expecting %p",
dp, pgno, mp);
mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
txn->mt_flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM;
}
loose = true /* ok, it's ours */;
}
int rc = mdbx_loose_page(txn, mp);
if (unlikely(rc != MDBX_SUCCESS))
mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
return rc;
}
if (txn->tw.spill_pages) {
const unsigned i = mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1);
if (i) {
/* This page is no longer spilled */
mdbx_tassert(txn, i == MDBX_PNL_SIZE(txn->tw.spill_pages) ||
txn->tw.spill_pages[i + 1] >= (pgno + npages) << 1);
txn->tw.spill_pages[i] |= 1;
if (i == MDBX_PNL_SIZE(txn->tw.spill_pages))
MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1;
int rc = mdbx_loose_page(txn, mp);
if (unlikely(rc != MDBX_SUCCESS))
mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
return rc;
}
}
if (loose) {
mdbx_debug("loosen db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno);
return mdbx_loose_page(txn, mp);
}
int rc = mdbx_pnl_append(&txn->tw.retired_pages, pgno);
mdbx_tassert(txn, mp == pgno2page(txn->mt_env, pgno));
int rc = (npages == 1)
? mdbx_pnl_append(&txn->tw.retired_pages, pgno)
: mdbx_pnl_append_range(&txn->tw.retired_pages, pgno, npages);
mdbx_tassert(txn, mdbx_dpl_find(txn->tw.dirtylist, pgno) == nullptr);
return rc;
}
static __must_check_result __inline int mdbx_retire_pgno(MDBX_cursor *mc,
const pgno_t pgno) {
MDBX_page *mp;
int rc = mdbx_page_get(mc, pgno, &mp, NULL);
if (likely(rc == MDBX_SUCCESS))
rc = mdbx_retire_page(mc, mp);
return rc;
}
/* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
*
* [in] mc A cursor handle for the current operation.
@ -3430,7 +3501,7 @@ done:
mdbx_ensure(env, pgno >= NUM_METAS);
if (env->me_flags & MDBX_WRITEMAP) {
np = pgno2page(env, pgno);
/* LY: reset no-access flag from mdbx_kill_page() */
/* LY: reset no-access flag from mdbx_loose_page() */
VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num));
ASAN_UNPOISON_MEMORY_REGION(np, pgno2bytes(env, num));
} else {
@ -4800,7 +4871,8 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
pgno_t pending = 0;
if ((txn->mt_flags & MDBX_RDONLY) == 0) {
pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) +
(MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored);
(MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored) +
txn->tw.retired2parent_count;
for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent)
pending += parent->tw.loose_count;
}
@ -4901,11 +4973,12 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
if ((txn->mt_flags & MDBX_RDONLY) == 0)
mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose-count) + "
"%u(reclaimed-list) + %u(retired-pending) - %u(retired-stored)",
"%u(reclaimed-list) + %u(retired-pending) - %u(retired-stored) "
"+ %u(retired2parent)",
txn->mt_txnid, pending, txn->tw.loose_count,
MDBX_PNL_SIZE(txn->tw.reclaimed_pglist),
txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0,
retired_stored);
retired_stored, txn->tw.retired2parent_count);
mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO
"(free) + %" PRIaPGNO "(count) = %" PRIaPGNO
"(total) <> %" PRIaPGNO "(next-pgno)",
@ -5795,7 +5868,12 @@ int mdbx_txn_commit(MDBX_txn *txn) {
mdbx_dpage_free(txn->mt_env, mp, IS_OVERFLOW(mp) ? mp->mp_pages : 1);
dst->length -= 1;
}
parent->tw.dirtyroom += dst->sorted - dst->length;
dst->sorted = dst->length;
mdbx_tassert(parent,
parent->mt_parent ||
parent->tw.dirtyroom + parent->tw.dirtylist->length ==
MDBX_DPL_TXNFULL);
if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0 &&
MDBX_PNL_MOST(parent->tw.spill_pages) >= parent->mt_next_pgno << 1) {
@ -5855,8 +5933,9 @@ int mdbx_txn_commit(MDBX_txn *txn) {
pgno_t pn = txn->tw.spill_pages[i];
if (pn & 1)
continue; /* deleted spillpg */
parent->tw.dirtyroom += 1;
MDBX_page *dp = mdbx_dpl_remove(dst, pn >> 1);
if (dp && (env->me_flags & MDBX_WRITEMAP) == 0)
if ((env->me_flags & MDBX_WRITEMAP) == 0)
mdbx_dpage_free(env, dp, 1);
} while (++i <= MDBX_PNL_SIZE(txn->tw.spill_pages));
}
@ -5927,11 +6006,23 @@ int mdbx_txn_commit(MDBX_txn *txn) {
parent->mt_next_pgno << 1));
/* Append our loose page list to parent's */
MDBX_page **lp = &parent->tw.loose_pages;
while (*lp)
lp = &(*lp)->mp_next;
*lp = txn->tw.loose_pages;
parent->tw.loose_count += txn->tw.loose_count;
if (txn->tw.loose_pages) {
MDBX_page **lp = &parent->tw.loose_pages;
while (*lp)
lp = &(*lp)->mp_next;
*lp = txn->tw.loose_pages;
parent->tw.loose_count += txn->tw.loose_count;
}
if (txn->tw.retired2parent_pages) {
MDBX_page *mp = txn->tw.retired2parent_pages;
do {
MDBX_page *next = mp->mp_next;
rc = mdbx_loose_page(parent, mp);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
mp = next;
} while (mp);
}
env->me_txn = parent;
parent->mt_child = NULL;
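
The commit-merge path above can be read as two list operations, sketched here with similarly simplified stand-in types (`page`, `txn` and `loosen_page()` are illustrative, not libmdbx's API): splice the child's loose list onto the parent's, then loosen each deferred retired-to-parent page in the parent, saving `next` before each call because loosening relinks the page.

/* Sketch only: nested-commit handoff of loose and retired2parent pages. */
typedef struct page { struct page *next; unsigned pgno; } page;

typedef struct txn {
  page *loose_pages;
  unsigned loose_count;
  page *retired2parent_pages;
  unsigned retired2parent_count;
} txn;

/* Minimal stand-in for mdbx_loose_page(): push onto the loose list. */
static int loosen_page(txn *t, page *mp) {
  mp->next = t->loose_pages;
  t->loose_pages = mp;
  t->loose_count += 1;
  return 0;
}

static int merge_into_parent(txn *child, txn *parent) {
  if (child->loose_pages) {            /* splice child's loose list in */
    page **tail = &parent->loose_pages;
    while (*tail)
      tail = &(*tail)->next;
    *tail = child->loose_pages;
    parent->loose_count += child->loose_count;
  }
  for (page *mp = child->retired2parent_pages; mp; ) {
    page *next = mp->next;             /* loosen_page() rewrites mp->next */
    int rc = loosen_page(parent, mp);
    if (rc != 0)
      return rc;
    mp = next;
  }
  return 0;
}
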
@ -8045,7 +8136,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags,
size =
tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) +
sizeof(unsigned) + 1);
if ((env->me_pbuf = mdbx_calloc(2, env->me_psize)) &&
if ((env->me_pbuf = mdbx_calloc(1, env->me_psize)) &&
(txn = mdbx_calloc(1, size))) {
txn->mt_dbs = (MDBX_db *)((char *)txn + tsize);
txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs);
@ -8766,103 +8857,6 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) {
return mdbx_page_search_root(mc, key, flags);
}
static int mdbx_retire_ovpage(MDBX_cursor *mc, MDBX_page *mp) {
MDBX_txn *txn = mc->mc_txn;
pgno_t pg = mp->mp_pgno;
unsigned x = 0, ovpages = mp->mp_pages;
MDBX_env *env = txn->mt_env;
MDBX_PNL sl = txn->tw.spill_pages;
pgno_t pn = pg << 1;
int rc;
mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0);
mdbx_cassert(mc, IS_OVERFLOW(mp));
mdbx_debug("free ov page %" PRIaPGNO " (%u)", pg, ovpages);
if (mdbx_audit_enabled()) {
mdbx_cassert(
mc, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
const unsigned a = mdbx_pnl_search(txn->tw.reclaimed_pglist, pg);
mdbx_cassert(mc, a > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) ||
txn->tw.reclaimed_pglist[a] != pg);
if (a <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) &&
unlikely(txn->tw.reclaimed_pglist[a] == pg))
return MDBX_PROBLEM;
if (ovpages > 1) {
const unsigned b =
mdbx_pnl_search(txn->tw.reclaimed_pglist, pg + ovpages - 1);
mdbx_cassert(mc, a == b);
if (unlikely(a != b))
return MDBX_PROBLEM;
}
}
/* If the page is dirty or on the spill list we just acquired it,
* so we should give it back to our current free list, if any.
* Otherwise put it onto the list of pages we freed in this txn.
*
* Unsupported in nested txns: They would need to hide the page
* range in ancestor txns' dirty and spilled lists. */
if (txn->tw.last_reclaimed && !txn->mt_parent &&
(IS_DIRTY(mp) || (sl && (x = mdbx_pnl_exist(sl, pn)) > 0))) {
unsigned i, j;
pgno_t *mop;
rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, ovpages);
if (unlikely(rc))
return rc;
if (!IS_DIRTY(mp)) {
/* This page is no longer spilled */
if (x == MDBX_PNL_SIZE(sl))
MDBX_PNL_SIZE(sl)--;
else
sl[x] |= 1;
goto release;
}
/* Remove from dirty list */
MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, mp->mp_pgno);
if (unlikely(dp != mp)) {
mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", mp,
mp->mp_pgno);
txn->mt_flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM;
}
txn->tw.dirtyroom++;
mdbx_tassert(txn, txn->mt_parent ||
txn->tw.dirtyroom + txn->tw.dirtylist->length ==
MDBX_DPL_TXNFULL);
if (!(env->me_flags & MDBX_WRITEMAP))
mdbx_dpage_free(env, mp, ovpages);
release:
/* Insert in me_reclaimed_pglist */
mop = txn->tw.reclaimed_pglist;
j = MDBX_PNL_SIZE(mop) + ovpages;
for (i = MDBX_PNL_SIZE(mop); i && MDBX_PNL_DISORDERED(mop[i], pg);)
mop[j--] = mop[i--];
MDBX_PNL_SIZE(mop) += ovpages;
pgno_t n = MDBX_PNL_ASCENDING ? pg + ovpages : pg;
while (j > i)
mop[j--] = MDBX_PNL_ASCENDING ? --n : n++;
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno));
} else {
rc = mdbx_pnl_append_range(&txn->tw.retired_pages, pg, ovpages);
if (unlikely(rc))
return rc;
mdbx_tassert(txn, mdbx_dirtylist_check(txn));
}
mc->mc_db->md_overflow_pages -= ovpages;
if (unlikely(mc->mc_flags & C_SUB)) {
MDBX_db *outer = mdbx_outer_db(mc);
outer->md_overflow_pages -= ovpages;
}
return 0;
}
/* Return the data associated with a given node.
*
* [in] mc The cursor for this operation.
@ -10055,7 +10049,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
return MDBX_SUCCESS;
}
}
if ((rc2 = mdbx_retire_ovpage(mc, omp)) != MDBX_SUCCESS)
if ((rc2 = mdbx_retire_page(mc, omp)) != MDBX_SUCCESS)
return rc2;
} else {
olddata.iov_len = NODEDSZ(leaf);
@ -10477,9 +10471,6 @@ int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) {
if (leaf->mn_flags & F_SUBDATA) {
/* add all the child DB's pages to the free list */
mc->mc_db->md_branch_pages -= mc->mc_xcursor->mx_db.md_branch_pages;
mc->mc_db->md_leaf_pages -= mc->mc_xcursor->mx_db.md_leaf_pages;
mc->mc_db->md_overflow_pages -= mc->mc_xcursor->mx_db.md_overflow_pages;
rc = mdbx_drop0(&mc->mc_xcursor->mx_cursor, 0);
if (unlikely(rc))
goto fail;
@ -10498,7 +10489,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) {
memcpy(&pg, NODEDATA(leaf), sizeof(pg));
if (unlikely((rc = mdbx_page_get(mc, pg, &omp, NULL)) ||
(rc = mdbx_retire_ovpage(mc, omp))))
(rc = mdbx_retire_page(mc, omp))))
goto fail;
}
@ -14157,21 +14148,6 @@ int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) {
return mdbx_dbi_flags_ex(txn, dbi, flags, &state);
}
static __must_check_result int mdbx_retire_pgno(MDBX_cursor *mc,
const pgno_t pgno) {
MDBX_txn *const txn = mc->mc_txn;
if (txn->mt_flags & MDBX_WRITEMAP)
return mdbx_retire_page(mc, pgno2page(txn->mt_env, pgno));
MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno);
if (dp)
return mdbx_loose_page(txn, dp);
int rc = mdbx_pnl_append(&txn->tw.retired_pages, pgno);
mdbx_tassert(txn, mdbx_dirtylist_check(txn));
return rc;
}
/* Add all the DB's pages to the free list.
* [in] mc Cursor on the DB to free.
* [in] subs non-Zero to check for sub-DBs in this DB.
@ -14188,14 +14164,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) {
* This also avoids any P_LEAF2 pages, which have no nodes.
* Also if the DB doesn't have sub-DBs and has no overflow
* pages, omit scanning leaves. */
if (mc->mc_flags & C_SUB) {
MDBX_db *outer = mdbx_outer_db(mc);
outer->md_branch_pages -= mc->mc_db->md_branch_pages;
outer->md_leaf_pages -= mc->mc_db->md_leaf_pages;
outer->md_overflow_pages -= mc->mc_db->md_overflow_pages;
mdbx_cursor_pop(mc);
} else if (!subs && !mc->mc_db->md_overflow_pages)
if ((mc->mc_flags & C_SUB) || (subs | mc->mc_db->md_overflow_pages) == 0)
mdbx_cursor_pop(mc);
rc = mdbx_pnl_need(&txn->tw.retired_pages,
@ -14219,7 +14188,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) {
if (unlikely(rc))
goto done;
mdbx_cassert(mc, IS_OVERFLOW(omp));
rc = mdbx_retire_ovpage(mc, omp);
rc = mdbx_retire_page(mc, omp);
if (unlikely(rc))
goto done;
if (!mc->mc_db->md_overflow_pages && !subs)


@ -802,36 +802,6 @@ struct MDBX_txn {
MDBX_db *mt_dbs;
/* Array of sequence numbers for each DB handle */
unsigned *mt_dbiseqs;
union {
struct {
/* For read txns: This thread/txn's reader table slot, or NULL. */
MDBX_reader *reader;
} to;
struct {
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
MDBX_PNL retired_pages;
/* The list of loose pages that became unused and may be reused
* in this transaction, linked through NEXT_LOOSE_PAGE(page). */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
/* dirtylist room: Array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_DPL dirtylist;
pgno_t *reclaimed_pglist; /* Reclaimed freeDB pages */
txnid_t last_reclaimed; /* ID of last used record */
} tw;
};
/* Transaction DB Flags */
#define DB_DIRTY MDBX_TBL_DIRTY /* DB was written in this txn */
@ -852,6 +822,42 @@ struct MDBX_txn {
MDBX_dbi mt_numdbs;
size_t mt_owner; /* thread ID that owns this transaction */
mdbx_canary mt_canary;
union {
struct {
/* For read txns: This thread/txn's reader table slot, or NULL. */
MDBX_reader *reader;
} to;
struct {
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
MDBX_PNL retired_pages;
/* The list of loose pages that became unused and may be reused
* in this transaction, linked through `mp_next`. */
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
/* Number of retired to parent pages (tw.retired2parent_pages) */
unsigned retired2parent_count;
/* The list of parent's txn dirty pages that retired (became unused)
* in this transaction, linked through `mp_next`. */
MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_DPL dirtylist;
pgno_t *reclaimed_pglist; /* Reclaimed freeDB pages */
txnid_t last_reclaimed; /* ID of last used record */
} tw;
};
};
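
The spill_pages comment above describes a packed encoding: each entry is the page number shifted left by one bit, and the low bit marks a deleted slot. A minimal sketch of that encoding, assuming a plain `uint32_t` entry type rather than libmdbx's `pgno_t`/PNL machinery:

/* Sketch only: the spill-list entry encoding described above. */
#include <stdbool.h>
#include <stdint.h>

static inline uint32_t spill_encode(uint32_t pgno)        { return pgno << 1; }
static inline uint32_t spill_pgno(uint32_t entry)         { return entry >> 1; }
static inline bool     spill_is_deleted(uint32_t entry)   { return (entry & 1) != 0; }
static inline uint32_t spill_mark_deleted(uint32_t entry) { return entry | 1; }

Keeping the page number in the upper bits leaves the list sorted by page number while a slot can be marked deleted in place, which is why the commit-merge code above skips entries with `pn & 1` set.
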
/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.