mdbx: rework/optimize pages refunding.

Change-Id: I9315ea9187eaff4572536ab9c895fb6995eebd94
This commit is contained in:
Leonid Yuriev 2019-10-17 10:08:49 +03:00
parent 7f0dd35702
commit 363841caf8
2 changed files with 280 additions and 218 deletions

View File

@ -1643,6 +1643,9 @@ uint8_t mdbx_loglevel = MDBX_DEBUG;
MDBX_debug_func *mdbx_debug_logger; MDBX_debug_func *mdbx_debug_logger;
#endif /* MDBX_ALLOY */ #endif /* MDBX_ALLOY */
static bool mdbx_refund(MDBX_txn *txn);
static __must_check_result int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp);
static __must_check_result int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp);
static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
int flags); int flags);
static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num, static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num,
@ -2245,109 +2248,219 @@ static __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) {
return true; return true;
} }
static __must_check_result int mdbx_refund_dirty(MDBX_txn *txn, MDBX_page *mp) { static void mdbx_refund_reclaimed(MDBX_txn *txn) {
mdbx_verbose("refund page %" PRIaPGNO, mp->mp_pgno); /* Scanning in descend order */
MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, mp->mp_pgno); pgno_t next_pgno = txn->mt_next_pgno;
if (unlikely(dp != mp)) { const MDBX_PNL pnl = txn->tw.reclaimed_pglist;
mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", mp, mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1);
mp->mp_pgno); #if MDBX_PNL_ASCENDING
txn->mt_flags |= MDBX_TXN_ERROR; unsigned i = MDBX_PNL_SIZE(pnl);
return MDBX_PROBLEM; mdbx_tassert(txn, pnl[i] == next_pgno - 1);
} while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1)
;
if (txn->tw.spill_pages) { MDBX_PNL_SIZE(pnl) = i;
unsigned i = mdbx_pnl_exist(txn->tw.spill_pages, mp->mp_pgno << 1); #else
if (i) { unsigned i = 1;
txn->tw.spill_pages[i] |= 1; mdbx_tassert(txn, pnl[i] == next_pgno - 1);
if (i == MDBX_PNL_SIZE(txn->tw.spill_pages)) unsigned len = MDBX_PNL_SIZE(pnl);
MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1; while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1)
} ;
} MDBX_PNL_SIZE(pnl) = len -= i - 1;
for (unsigned move = 0; move < len; ++move)
txn->tw.dirtyroom += 1; pnl[1 + move] = pnl[i + move];
mdbx_tassert(txn, txn->mt_parent || #endif
txn->tw.dirtyroom + txn->tw.dirtylist->length == mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO,
MDBX_DPL_TXNFULL); txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno);
if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) txn->mt_next_pgno = next_pgno;
mdbx_dpage_free(txn->mt_env, mp, 1); mdbx_tassert(
return MDBX_SUCCESS; txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
}
/* try to refund loose pages */
static __must_check_result int mdbx_refund_loose(MDBX_txn *txn, MDBX_page *mp) {
if (mp) {
int rc = mdbx_refund_dirty(txn, mp);
txn->mt_next_pgno -= 1;
if (unlikely(rc != MDBX_SUCCESS))
return rc;
if (!txn->tw.loose_pages)
return MDBX_SUCCESS;
} }
static void mdbx_refund_loose(MDBX_txn *txn) {
mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, mdbx_dirtylist_check(txn));
mdbx_tassert(txn, txn->tw.loose_pages != nullptr); mdbx_tassert(txn, txn->tw.loose_pages != nullptr);
mdbx_tassert(txn, txn->tw.loose_count > 0); mdbx_tassert(txn, txn->tw.loose_count > 0);
const MDBX_DPL dl = txn->tw.dirtylist;
mdbx_tassert(txn, dl->length >= txn->tw.loose_count);
mdbx_tassert(txn, txn->tw.spill_pages == nullptr ||
dl->length >= MDBX_PNL_SIZE(txn->tw.spill_pages));
pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
MDBX_PNL pnl = onstack; MDBX_PNL suitable = onstack;
MDBX_PNL_SIZE(pnl) = 0;
if (dl->length - dl->sorted > txn->tw.loose_count) {
/* Dirty list is useless since unsorted. */
MDBX_PNL_SIZE(suitable) = 0;
if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) { if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) {
pnl = mdbx_pnl_alloc(txn->tw.loose_count); suitable = mdbx_pnl_alloc(txn->tw.loose_count);
if (unlikely(!pnl)) if (unlikely(!suitable))
return /* this is not a reason for transaction fail */ MDBX_SUCCESS; return /* this is not a reason for transaction fail */;
} }
/* first pass: collect pages which may be refunded */ /* Collect loose-pages which may be refunded. */
mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count);
pgno_t most = MIN_PAGENO; pgno_t most = MIN_PAGENO;
for (mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { for (const MDBX_page *dp = txn->tw.loose_pages; dp; dp = dp->mp_next) {
mdbx_tassert(txn, txn->mt_next_pgno > mp->mp_pgno); mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY));
if (likely(txn->mt_next_pgno - txn->tw.loose_count <= mp->mp_pgno)) { mdbx_tassert(txn, txn->mt_next_pgno > dp->mp_pgno);
mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) < ((pnl == onstack) if (likely(txn->mt_next_pgno - txn->tw.loose_count <= dp->mp_pgno)) {
? bytes2pnl(sizeof(onstack)) mdbx_tassert(txn,
: MDBX_PNL_ALLOCLEN(pnl))); MDBX_PNL_SIZE(suitable) <
MDBX_PNL_SIZE(pnl) += 1; ((suitable == onstack) ? bytes2pnl(sizeof(onstack))
MDBX_PNL_LAST(pnl) = mp->mp_pgno; : MDBX_PNL_ALLOCLEN(suitable)));
most = (mp->mp_pgno > most) ? mp->mp_pgno : most; MDBX_PNL_SIZE(suitable) += 1;
MDBX_PNL_LAST(suitable) = dp->mp_pgno;
most = (dp->mp_pgno > most) ? dp->mp_pgno : most;
} }
} }
int rc = MDBX_SUCCESS;
if (most + 1 == txn->mt_next_pgno) { if (most + 1 == txn->mt_next_pgno) {
/* second pass: sort and refund suitable pages */ /* Sort suitable list and refund pages at the tail. */
mdbx_pnl_sort(pnl); mdbx_pnl_sort(suitable);
txn->mt_next_pgno -= 1;
#if MDBX_PNL_ASCENDING /* Scanning in descend order */
mdbx_tassert(txn, txn->mt_next_pgno == pnl[MDBX_PNL_SIZE(pnl)]); const int step = MDBX_PNL_ASCENDING ? -1 : 1;
for (unsigned i = MDBX_PNL_SIZE(pnl); --i >= 1;) { const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1;
#else const int end = MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_SIZE(suitable) + 1;
mdbx_tassert(txn, txn->mt_next_pgno == pnl[1]); mdbx_tassert(txn, suitable[begin] >= suitable[end - step]);
for (unsigned i = 1; ++i <= MDBX_PNL_SIZE(pnl);) { mdbx_tassert(txn, most == suitable[begin]);
#endif
if (pnl[i] != txn->mt_next_pgno - 1) for (int i = begin + step; i != end; i += step) {
if (suitable[i] != most - 1)
break; break;
txn->mt_next_pgno -= 1; most -= 1;
}
const unsigned refunded = txn->mt_next_pgno - most;
mdbx_verbose("refund-sorted %u pages %" PRIaPGNO " -> %" PRIaPGNO,
refunded, most, txn->mt_next_pgno);
txn->tw.loose_count -= refunded;
txn->tw.dirtyroom += refunded;
txn->mt_next_pgno = most;
/* Filter-out dirty list */
unsigned w = 0, r = w;
if (dl->sorted) {
do {
if (dl[++r].pgno < most) {
if (++w != r)
dl[w] = dl[r];
}
} while (r < dl->sorted);
dl->sorted = w;
}
while (r < dl->length) {
if (dl[++r].pgno < most) {
if (++w != r)
dl[w] = dl[r];
}
}
dl->length = w;
mdbx_tassert(txn, txn->mt_parent ||
txn->tw.dirtyroom + txn->tw.dirtylist->length ==
MDBX_DPL_TXNFULL);
goto unlink_loose;
}
} else {
/* Dirtylist is mostly sorted, just refund loose pages at the end. */
mdbx_dpl_sort(dl);
mdbx_tassert(txn, dl->length < 2 || dl[1].pgno < dl[dl->length].pgno);
mdbx_tassert(txn, dl->sorted == dl->length);
/* Scan dirtylist tail-forward and cutoff suitable pages. */
while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 &&
dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) {
MDBX_page *dp = dl[dl->length].ptr;
mdbx_verbose("refund-unsorted page %" PRIaPGNO, dp->mp_pgno);
mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno);
dl->length -= 1;
} }
/* third pass: filter-out & dispose refunded pages */ if (dl->sorted != dl->length) {
const unsigned refunded = dl->sorted - dl->length;
dl->sorted = dl->length;
txn->tw.loose_count -= refunded;
txn->tw.dirtyroom += refunded;
txn->mt_next_pgno -= refunded;
mdbx_tassert(txn, txn->mt_parent ||
txn->tw.dirtyroom + txn->tw.dirtylist->length ==
MDBX_DPL_TXNFULL);
/* Filter-out loose chain & dispose refunded pages. */
unlink_loose:
for (MDBX_page **link = &txn->tw.loose_pages; *link;) { for (MDBX_page **link = &txn->tw.loose_pages; *link;) {
mp = *link; MDBX_page *dp = *link;
if (txn->mt_next_pgno > mp->mp_pgno) { mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY));
link = &mp->mp_next; if (txn->mt_next_pgno > dp->mp_pgno) {
link = &dp->mp_next;
} else { } else {
*link = mp->mp_next; *link = dp->mp_next;
txn->tw.loose_count -= 1; if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
rc = mdbx_refund_dirty(txn, mp); mdbx_dpage_free(txn->mt_env, dp, 1);
if (unlikely(rc != MDBX_SUCCESS)) }
break;
} }
} }
} }
mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, mdbx_dirtylist_check(txn));
if (pnl != onstack) mdbx_tassert(txn, txn->mt_parent ||
mdbx_pnl_free(pnl); txn->tw.dirtyroom + txn->tw.dirtylist->length ==
return rc; MDBX_DPL_TXNFULL);
if (suitable != onstack)
mdbx_pnl_free(suitable);
txn->tw.loose_refund_wl = txn->mt_next_pgno;
}
static bool mdbx_refund(MDBX_txn *txn) {
const pgno_t before = txn->mt_next_pgno;
if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno)
mdbx_refund_loose(txn);
while (true) {
if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 ||
MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1)
break;
mdbx_refund_reclaimed(txn);
if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno)
break;
const pgno_t memo = txn->mt_next_pgno;
mdbx_refund_loose(txn);
if (memo == txn->mt_next_pgno)
break;
}
return before != txn->mt_next_pgno;
}
static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno,
unsigned npages) {
mdbx_assert(env, pgno >= NUM_METAS && npages);
if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) {
const size_t bytes = pgno2bytes(env, npages);
memset(mp, 0, bytes);
mp->mp_pgno = pgno;
if ((env->me_flags & MDBX_WRITEMAP) == 0)
mdbx_pwrite(env->me_fd, mp, bytes, pgno2bytes(env, pgno));
} else {
struct iovec iov[MDBX_COMMIT_PAGES];
iov[0].iov_len = env->me_psize;
iov[0].iov_base = (char *)env->me_pbuf + env->me_psize;
size_t iov_off = pgno2bytes(env, pgno);
unsigned n = 1;
while (--npages) {
iov[n] = iov[0];
if (++n == MDBX_COMMIT_PAGES) {
mdbx_pwritev(env->me_fd, iov, MDBX_COMMIT_PAGES, iov_off,
pgno2bytes(env, MDBX_COMMIT_PAGES));
iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES);
n = 0;
}
}
mdbx_pwritev(env->me_fd, iov, n, iov_off, pgno2bytes(env, n));
}
} }
/* Retire, loosen or free a single page. /* Retire, loosen or free a single page.
@ -2361,10 +2474,9 @@ static __must_check_result int mdbx_refund_loose(MDBX_txn *txn, MDBX_page *mp) {
* If the page wasn't dirtied in this txn, just add it * If the page wasn't dirtied in this txn, just add it
* to this txn's free list. */ * to this txn's free list. */
static __must_check_result __hot int mdbx_loose_page(MDBX_txn *txn, static __hot int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) {
MDBX_page *mp) {
const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
pgno_t pgno = mp->mp_pgno; const pgno_t pgno = mp->mp_pgno;
if (txn->mt_parent) { if (txn->mt_parent) {
mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0);
@ -2389,12 +2501,9 @@ static __must_check_result __hot int mdbx_loose_page(MDBX_txn *txn,
} }
mdbx_debug("loosen page %" PRIaPGNO, pgno); mdbx_debug("loosen page %" PRIaPGNO, pgno);
if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { const bool is_dirty = IS_DIRTY(mp);
mdbx_tassert(txn, pgno >= NUM_METAS); if (MDBX_DEBUG || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) {
const size_t bytes = pgno2bytes(txn->mt_env, npages); mdbx_kill_page(txn->mt_env, mp, pgno, npages);
memset(mp, 0, bytes);
if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0)
mdbx_pwrite(txn->mt_env->me_fd, mp, bytes, pgno2bytes(txn->mt_env, pgno));
VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
} }
VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_data, txn->mt_env->me_psize - PAGEHDRSZ); VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_data, txn->mt_env->me_psize - PAGEHDRSZ);
@ -2402,12 +2511,12 @@ static __must_check_result __hot int mdbx_loose_page(MDBX_txn *txn,
if (unlikely(npages > if (unlikely(npages >
1 /* overflow pages doesn't comes to the loose-list */)) { 1 /* overflow pages doesn't comes to the loose-list */)) {
if (IS_DIRTY(mp)) { if (is_dirty) {
/* Remove from dirty list */ /* Remove from dirty list */
MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, mp->mp_pgno); MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, pgno);
if (unlikely(dp != mp)) { if (unlikely(dp != mp)) {
mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", mp, mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", mp,
mp->mp_pgno); pgno);
txn->mt_flags |= MDBX_TXN_ERROR; txn->mt_flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM; return MDBX_PROBLEM;
} }
@ -2419,20 +2528,25 @@ static __must_check_result __hot int mdbx_loose_page(MDBX_txn *txn,
mdbx_dpage_free(txn->mt_env, mp, npages); mdbx_dpage_free(txn->mt_env, mp, npages);
} }
if (unlikely(pgno + npages == txn->mt_next_pgno)) {
txn->mt_next_pgno = pgno;
mdbx_refund(txn);
return MDBX_SUCCESS;
}
int rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, npages); int rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, npages);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
/* Insert in me_reclaimed_pglist */ /* Insert in me_reclaimed_pglist */
MDBX_PNL pnl = txn->tw.reclaimed_pglist; const MDBX_PNL pnl = txn->tw.reclaimed_pglist;
unsigned r, w = MDBX_PNL_SIZE(pnl) + npages; unsigned r = MDBX_PNL_SIZE(pnl), w = r + npages;
for (r = MDBX_PNL_SIZE(pnl); r && MDBX_PNL_DISORDERED(pnl[r], pgno);) MDBX_PNL_SIZE(pnl) = w;
while (r && MDBX_PNL_DISORDERED(pnl[r], pgno))
pnl[w--] = pnl[r--]; pnl[w--] = pnl[r--];
MDBX_PNL_SIZE(pnl) += npages;
pgno = MDBX_PNL_ASCENDING ? pgno + npages : pgno; for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + npages : pgno; w > r; --w)
while (w > r) pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++;
pnl[w--] = MDBX_PNL_ASCENDING ? --pgno : pgno++;
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno)); txn->mt_next_pgno));
@ -2440,21 +2554,16 @@ static __must_check_result __hot int mdbx_loose_page(MDBX_txn *txn,
} }
mp->mp_flags = P_LOOSE | P_DIRTY; mp->mp_flags = P_LOOSE | P_DIRTY;
mp->mp_pgno = pgno;
if (likely(txn->mt_next_pgno != pgno + 1)) {
mp->mp_next = txn->tw.loose_pages; mp->mp_next = txn->tw.loose_pages;
txn->tw.loose_pages = mp; txn->tw.loose_pages = mp;
txn->tw.loose_count++; txn->tw.loose_count++;
} else { if (unlikely(txn->mt_next_pgno == mp->mp_pgno + 1))
int rc = mdbx_refund_loose(txn, mp); mdbx_refund(txn);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
static __must_check_result __hot int mdbx_retire_page(MDBX_cursor *mc, static __hot int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) {
MDBX_page *mp) {
const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
const pgno_t pgno = mp->mp_pgno; const pgno_t pgno = mp->mp_pgno;
MDBX_txn *const txn = mc->mc_txn; MDBX_txn *const txn = mc->mc_txn;
@ -2475,7 +2584,7 @@ static __must_check_result __hot int mdbx_retire_page(MDBX_cursor *mc,
mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? npages : 0; mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? npages : 0;
if (IS_DIRTY(mp)) { if (IS_DIRTY(mp)) {
int rc = mdbx_loose_page(txn, mp); int rc = mdbx_page_loose(txn, mp);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
mc->mc_flags &= ~(C_INITIALIZED | C_EOF); mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
return rc; return rc;
@ -2490,7 +2599,7 @@ static __must_check_result __hot int mdbx_retire_page(MDBX_cursor *mc,
txn->tw.spill_pages[i] |= 1; txn->tw.spill_pages[i] |= 1;
if (i == MDBX_PNL_SIZE(txn->tw.spill_pages)) if (i == MDBX_PNL_SIZE(txn->tw.spill_pages))
MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1; MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1;
int rc = mdbx_loose_page(txn, mp); int rc = mdbx_page_loose(txn, mp);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
mc->mc_flags &= ~(C_INITIALIZED | C_EOF); mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
return rc; return rc;
@ -2510,7 +2619,7 @@ static __must_check_result __inline int mdbx_retire_pgno(MDBX_cursor *mc,
MDBX_page *mp; MDBX_page *mp;
int rc = mdbx_page_get(mc, pgno, &mp, NULL); int rc = mdbx_page_get(mc, pgno, &mp, NULL);
if (likely(rc == MDBX_SUCCESS)) if (likely(rc == MDBX_SUCCESS))
rc = mdbx_retire_page(mc, mp); rc = mdbx_page_retire(mc, mp);
return rc; return rc;
} }
@ -3170,6 +3279,12 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
/* If there are any loose pages, just use them */ /* If there are any loose pages, just use them */
mdbx_assert(env, mp && num); mdbx_assert(env, mp && num);
if (likely(num == 1 && txn->tw.loose_pages)) { if (likely(num == 1 && txn->tw.loose_pages)) {
if (txn->tw.loose_refund_wl > txn->mt_next_pgno) {
mdbx_refund(txn);
if (unlikely(!txn->tw.loose_pages))
goto skip_cache;
}
np = txn->tw.loose_pages; np = txn->tw.loose_pages;
txn->tw.loose_pages = np->mp_next; txn->tw.loose_pages = np->mp_next;
txn->tw.loose_count--; txn->tw.loose_count--;
@ -3184,6 +3299,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
} }
skip_cache:
mdbx_tassert( mdbx_tassert(
txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
@ -3367,38 +3483,12 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
mdbx_tassert(txn, mdbx_tassert(txn,
repg_len == 0 || repg_list[repg_len] < txn->mt_next_pgno); repg_len == 0 || repg_list[repg_len] < txn->mt_next_pgno);
if (repg_len) { if (repg_len &&
unlikely(MDBX_PNL_MOST(repg_list) == txn->mt_next_pgno - 1)) {
/* Refund suitable pages into "unallocated" space */ /* Refund suitable pages into "unallocated" space */
pgno_t tail = txn->mt_next_pgno; mdbx_refund(txn);
pgno_t *const begin = repg_list + 1; repg_list = txn->tw.reclaimed_pglist;
pgno_t *const end = begin + repg_len; repg_len = MDBX_PNL_SIZE(repg_list);
pgno_t *higest;
#if MDBX_PNL_ASCENDING
for (higest = end; --higest >= begin;) {
#else
for (higest = begin; higest < end; ++higest) {
#endif /* MDBX_PNL sort-order */
mdbx_tassert(txn, *higest >= NUM_METAS && *higest < tail);
if (*higest != tail - 1)
break;
tail -= 1;
}
if (tail != txn->mt_next_pgno) {
#if MDBX_PNL_ASCENDING
repg_len = (unsigned)(higest + 1 - begin);
#else
repg_len -= (unsigned)(higest - begin);
for (pgno_t *move = begin; higest < end; ++move, ++higest)
*move = *higest;
#endif /* MDBX_PNL sort-order */
MDBX_PNL_SIZE(repg_list) = repg_len;
mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO
" -> %" PRIaPGNO,
txn->mt_next_pgno - tail, tail, txn->mt_next_pgno);
txn->mt_next_pgno = tail;
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno));
}
} }
/* Don't try to coalesce too much. */ /* Don't try to coalesce too much. */
@ -4229,6 +4319,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
/* Moved to here to avoid a data race in read TXNs */ /* Moved to here to avoid a data race in read TXNs */
txn->mt_geo = meta->mm_geo; txn->mt_geo = meta->mm_geo;
txn->tw.loose_refund_wl = txn->mt_next_pgno;
} }
/* Setup db info */ /* Setup db info */
@ -4461,6 +4552,7 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags,
txn->mt_txnid = parent->mt_txnid; txn->mt_txnid = parent->mt_txnid;
txn->tw.dirtyroom = parent->tw.dirtyroom; txn->tw.dirtyroom = parent->tw.dirtyroom;
txn->mt_geo = parent->mt_geo; txn->mt_geo = parent->mt_geo;
txn->tw.loose_refund_wl = parent->tw.loose_refund_wl;
txn->mt_canary = parent->mt_canary; txn->mt_canary = parent->mt_canary;
parent->mt_flags |= MDBX_TXN_HAS_CHILD; parent->mt_flags |= MDBX_TXN_HAS_CHILD;
parent->mt_child = txn; parent->mt_child = txn;
@ -5116,7 +5208,8 @@ retry:
} }
} }
// handle loose pages - put ones into the reclaimed- or retired-list /* return suitable into unallocated space */
if (mdbx_refund(txn)) {
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno)); txn->mt_next_pgno));
if (mdbx_audit_enabled()) { if (mdbx_audit_enabled()) {
@ -5124,15 +5217,24 @@ retry:
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
}
/* handle loose pages - put ones into the reclaimed- or retired-list */
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno));
mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
MDBX_DPL_TXNFULL);
mdbx_tassert(txn, mdbx_dirtylist_check(txn));
if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored, false);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
if (txn->tw.loose_pages) { if (txn->tw.loose_pages) {
/* Return loose page numbers to me_reclaimed_pglist, /* Return loose page numbers to me_reclaimed_pglist,
* though usually none are left at this point. * though usually none are left at this point.
* The pages themselves remain in dirtylist. */ * The pages themselves remain in dirtylist. */
if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) {
rc = mdbx_refund_loose(txn, nullptr);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
if (txn->tw.loose_count > 0) { if (txn->tw.loose_count > 0) {
/* Put loose page numbers in tw.retired_pages, /* Put loose page numbers in tw.retired_pages,
* since unable to return them to me_reclaimed_pglist. */ * since unable to return them to me_reclaimed_pglist. */
@ -5203,47 +5305,6 @@ retry:
} }
} }
// handle reclaimed pages - return suitable into unallocated space
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno));
if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) {
pgno_t tail = txn->mt_next_pgno;
pgno_t *const begin = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist);
pgno_t *const end = MDBX_PNL_END(txn->tw.reclaimed_pglist);
pgno_t *higest;
#if MDBX_PNL_ASCENDING
for (higest = end; --higest >= begin;) {
#else
for (higest = begin; higest < end; ++higest) {
#endif /* MDBX_PNL sort-order */
mdbx_tassert(txn, *higest >= NUM_METAS && *higest < tail);
if (*higest != tail - 1)
break;
tail -= 1;
}
if (tail != txn->mt_next_pgno) {
#if MDBX_PNL_ASCENDING
MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) =
(unsigned)(higest + 1 - begin);
#else
MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) -= (unsigned)(higest - begin);
for (pgno_t *move = begin; higest < end; ++move, ++higest)
*move = *higest;
#endif /* MDBX_PNL sort-order */
mdbx_verbose(
"%s.refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO,
dbg_prefix_mode, txn->mt_next_pgno - tail, tail, txn->mt_next_pgno);
txn->mt_next_pgno = tail;
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno));
if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored, false);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
}
}
// handle retired-list - store ones into single gc-record // handle retired-list - store ones into single gc-record
if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) {
if (unlikely(!retired_stored)) { if (unlikely(!retired_stored)) {
@ -5331,7 +5392,6 @@ retry:
const unsigned prefer_max_scatter = 257; const unsigned prefer_max_scatter = 257;
txnid_t reservation_gc_id; txnid_t reservation_gc_id;
if (lifo) { if (lifo) {
mdbx_tassert(txn, txn->tw.lifo_reclaimed != NULL);
if (unlikely(!txn->tw.lifo_reclaimed)) { if (unlikely(!txn->tw.lifo_reclaimed)) {
txn->tw.lifo_reclaimed = mdbx_txl_alloc(); txn->tw.lifo_reclaimed = mdbx_txl_alloc();
if (unlikely(!txn->tw.lifo_reclaimed)) { if (unlikely(!txn->tw.lifo_reclaimed)) {
@ -6040,7 +6100,7 @@ int mdbx_txn_commit(MDBX_txn *txn) {
MDBX_page *mp = txn->tw.retired2parent_pages; MDBX_page *mp = txn->tw.retired2parent_pages;
do { do {
MDBX_page *next = mp->mp_next; MDBX_page *next = mp->mp_next;
rc = mdbx_loose_page(parent, mp); rc = mdbx_page_loose(parent, mp);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto fail; goto fail;
mp = next; mp = next;
@ -6056,12 +6116,12 @@ int mdbx_txn_commit(MDBX_txn *txn) {
/* Scan parent's loose page for suitable for refund */ /* Scan parent's loose page for suitable for refund */
for (MDBX_page *mp = parent->tw.loose_pages; mp; mp = mp->mp_next) { for (MDBX_page *mp = parent->tw.loose_pages; mp; mp = mp->mp_next) {
if (mp->mp_pgno == parent->mt_next_pgno - 1) { if (mp->mp_pgno == parent->mt_next_pgno - 1) {
rc = mdbx_refund_loose(parent, nullptr); mdbx_refund(parent);
break; break;
} }
} }
mdbx_tassert(txn, mdbx_dirtylist_check(parent)); mdbx_tassert(txn, mdbx_dirtylist_check(parent));
return rc; return MDBX_SUCCESS;
} }
mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
@ -8159,7 +8219,8 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags,
size = size =
tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) +
sizeof(unsigned) + 1); sizeof(unsigned) + 1);
if ((env->me_pbuf = mdbx_calloc(1, env->me_psize)) && if ((env->me_pbuf = mdbx_calloc(
1 /* page buffer */ + 1 /* page killer bufer */, env->me_psize)) &&
(txn = mdbx_calloc(1, size))) { (txn = mdbx_calloc(1, size))) {
txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); txn->mt_dbs = (MDBX_db *)((char *)txn + tsize);
txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs);
@ -10072,7 +10133,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
} }
if ((rc2 = mdbx_retire_page(mc, omp)) != MDBX_SUCCESS) if ((rc2 = mdbx_page_retire(mc, omp)) != MDBX_SUCCESS)
return rc2; return rc2;
} else { } else {
olddata.iov_len = NODEDSZ(leaf); olddata.iov_len = NODEDSZ(leaf);
@ -10512,7 +10573,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) {
memcpy(&pg, NODEDATA(leaf), sizeof(pg)); memcpy(&pg, NODEDATA(leaf), sizeof(pg));
if (unlikely((rc = mdbx_page_get(mc, pg, &omp, NULL)) || if (unlikely((rc = mdbx_page_get(mc, pg, &omp, NULL)) ||
(rc = mdbx_retire_page(mc, omp)))) (rc = mdbx_page_retire(mc, omp))))
goto fail; goto fail;
} }
@ -11711,7 +11772,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
/* If not operating on FreeDB, allow this page to be reused /* If not operating on FreeDB, allow this page to be reused
* in this txn. Otherwise just add to free list. */ * in this txn. Otherwise just add to free list. */
rc = mdbx_retire_page(csrc, psrc); rc = mdbx_page_retire(csrc, psrc);
if (unlikely(rc)) if (unlikely(rc))
return rc; return rc;
@ -11886,7 +11947,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
mc->mc_top = 0; mc->mc_top = 0;
mc->mc_flags &= ~C_INITIALIZED; mc->mc_flags &= ~C_INITIALIZED;
rc = mdbx_retire_page(mc, mp); rc = mdbx_page_retire(mc, mp);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
} else if (IS_BRANCH(mp) && nkeys == 1) { } else if (IS_BRANCH(mp) && nkeys == 1) {
@ -11923,7 +11984,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
rc = mdbx_retire_page(mc, mp); rc = mdbx_page_retire(mc, mp);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
} else { } else {
@ -14211,7 +14272,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) {
if (unlikely(rc)) if (unlikely(rc))
goto done; goto done;
mdbx_cassert(mc, IS_OVERFLOW(omp)); mdbx_cassert(mc, IS_OVERFLOW(omp));
rc = mdbx_retire_page(mc, omp); rc = mdbx_page_retire(mc, omp);
if (unlikely(rc)) if (unlikely(rc))
goto done; goto done;
if (!mc->mc_db->md_overflow_pages && !subs) if (!mc->mc_db->md_overflow_pages && !subs)

View File

@ -829,6 +829,16 @@ struct MDBX_txn {
MDBX_reader *reader; MDBX_reader *reader;
} to; } to;
struct { struct {
pgno_t *reclaimed_pglist; /* Reclaimed freeDB pages */
txnid_t last_reclaimed; /* ID of last used record */
pgno_t loose_refund_wl /* FIXME: describe */;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_DPL dirtylist;
/* The list of reclaimed txns from GC */ /* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed; MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */ /* The list of pages that became unused during this transaction. */
@ -847,15 +857,6 @@ struct MDBX_txn {
* because the dirty list was full. page numbers in here are * because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */ * shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages; MDBX_PNL spill_pages;
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_DPL dirtylist;
pgno_t *reclaimed_pglist; /* Reclaimed freeDB pages */
txnid_t last_reclaimed; /* ID of last used record */
} tw; } tw;
}; };
}; };