mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-08 08:04:13 +08:00
mdbx: rework gc-backlog preparation.
Change-Id: I003d7995116a25c2bfdd32b663d4e605a796c17d
This commit is contained in:
parent
ba935ab8f1
commit
8b4be26ca1
@ -2105,8 +2105,8 @@ MDBX_debug_func *mdbx_debug_logger;
|
|||||||
static bool mdbx_refund(MDBX_txn *txn);
|
static bool mdbx_refund(MDBX_txn *txn);
|
||||||
static __must_check_result int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp);
|
static __must_check_result int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp);
|
||||||
static __must_check_result int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp);
|
static __must_check_result int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp);
|
||||||
static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
|
static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
|
||||||
int flags);
|
MDBX_page **const mp, int flags);
|
||||||
static txnid_t mdbx_oomkick(MDBX_env *env, const txnid_t laggard);
|
static txnid_t mdbx_oomkick(MDBX_env *env, const txnid_t laggard);
|
||||||
|
|
||||||
static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num,
|
static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num,
|
||||||
@ -3925,8 +3925,8 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) {
|
|||||||
#define MDBX_ALLOC_NEW 4
|
#define MDBX_ALLOC_NEW 4
|
||||||
#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW)
|
#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW)
|
||||||
|
|
||||||
static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
|
static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
|
||||||
int flags) {
|
MDBX_page **const mp, int flags) {
|
||||||
int rc;
|
int rc;
|
||||||
MDBX_txn *txn = mc->mc_txn;
|
MDBX_txn *txn = mc->mc_txn;
|
||||||
MDBX_env *env = txn->mt_env;
|
MDBX_env *env = txn->mt_env;
|
||||||
@ -3945,10 +3945,10 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (likely(flags & MDBX_ALLOC_CACHE)) {
|
if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) {
|
||||||
/* If there are any loose pages, just use them */
|
/* If there are any loose pages, just use them */
|
||||||
mdbx_assert(env, mp && num);
|
mdbx_assert(env, mp && num);
|
||||||
if (likely(num == 1 && txn->tw.loose_pages)) {
|
if (likely(txn->tw.loose_pages)) {
|
||||||
if (txn->tw.loose_refund_wl > txn->mt_next_pgno) {
|
if (txn->tw.loose_refund_wl > txn->mt_next_pgno) {
|
||||||
mdbx_refund(txn);
|
mdbx_refund(txn);
|
||||||
if (unlikely(!txn->tw.loose_pages))
|
if (unlikely(!txn->tw.loose_pages))
|
||||||
@ -3976,7 +3976,7 @@ skip_cache:
|
|||||||
txnid_t oldest = 0, last = 0;
|
txnid_t oldest = 0, last = 0;
|
||||||
const unsigned wanna_range = num - 1;
|
const unsigned wanna_range = num - 1;
|
||||||
|
|
||||||
while (1) { /* oom-kick retry loop */
|
while (true) { /* oom-kick retry loop */
|
||||||
/* If our dirty list is already full, we can't do anything */
|
/* If our dirty list is already full, we can't do anything */
|
||||||
if (unlikely(txn->tw.dirtyroom == 0)) {
|
if (unlikely(txn->tw.dirtyroom == 0)) {
|
||||||
rc = MDBX_TXN_FULL;
|
rc = MDBX_TXN_FULL;
|
||||||
@ -4098,8 +4098,8 @@ skip_cache:
|
|||||||
if (flags & MDBX_LIFORECLAIM) {
|
if (flags & MDBX_LIFORECLAIM) {
|
||||||
/* skip IDs of records that already reclaimed */
|
/* skip IDs of records that already reclaimed */
|
||||||
if (txn->tw.lifo_reclaimed) {
|
if (txn->tw.lifo_reclaimed) {
|
||||||
unsigned i;
|
size_t i;
|
||||||
for (i = (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i)
|
for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i)
|
||||||
if (txn->tw.lifo_reclaimed[i] == last)
|
if (txn->tw.lifo_reclaimed[i] == last)
|
||||||
break;
|
break;
|
||||||
if (i)
|
if (i)
|
||||||
@ -4187,8 +4187,7 @@ skip_cache:
|
|||||||
flags &= ~MDBX_COALESCE;
|
flags &= ~MDBX_COALESCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) ==
|
if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE &&
|
||||||
(MDBX_COALESCE | MDBX_ALLOC_CACHE) &&
|
|
||||||
re_len > wanna_range) {
|
re_len > wanna_range) {
|
||||||
range_begin = MDBX_PNL_ASCENDING ? 1 : re_len;
|
range_begin = MDBX_PNL_ASCENDING ? 1 : re_len;
|
||||||
pgno = MDBX_PNL_LEAST(re_list);
|
pgno = MDBX_PNL_LEAST(re_list);
|
||||||
@ -4256,7 +4255,8 @@ skip_cache:
|
|||||||
rc = mdbx_wipe_steady(env, oldest);
|
rc = mdbx_wipe_steady(env, oldest);
|
||||||
mdbx_debug("gc-wipe-steady, rc %d", rc);
|
mdbx_debug("gc-wipe-steady, rc %d", rc);
|
||||||
mdbx_assert(env, steady != mdbx_meta_steady(env));
|
mdbx_assert(env, steady != mdbx_meta_steady(env));
|
||||||
} else if ((autosync_threshold &&
|
} else if ((flags & MDBX_ALLOC_NEW) == 0 ||
|
||||||
|
(autosync_threshold &&
|
||||||
*env->me_unsynced_pages >= autosync_threshold) ||
|
*env->me_unsynced_pages >= autosync_threshold) ||
|
||||||
(autosync_period &&
|
(autosync_period &&
|
||||||
mdbx_osal_monotime() - *env->me_sync_timestamp >=
|
mdbx_osal_monotime() - *env->me_sync_timestamp >=
|
||||||
@ -4309,8 +4309,6 @@ skip_cache:
|
|||||||
rc = mdbx_mapresize(env, txn->mt_next_pgno, aligned, txn->mt_geo.upper);
|
rc = mdbx_mapresize(env, txn->mt_next_pgno, aligned, txn->mt_geo.upper);
|
||||||
if (rc == MDBX_SUCCESS) {
|
if (rc == MDBX_SUCCESS) {
|
||||||
env->me_txn->mt_end_pgno = aligned;
|
env->me_txn->mt_end_pgno = aligned;
|
||||||
if (!mp)
|
|
||||||
return rc;
|
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4325,8 +4323,8 @@ skip_cache:
|
|||||||
fail:
|
fail:
|
||||||
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
|
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
|
||||||
txn->mt_next_pgno));
|
txn->mt_next_pgno));
|
||||||
if (mp) {
|
if (likely(mp)) {
|
||||||
*mp = NULL;
|
*mp = nullptr;
|
||||||
txn->mt_flags |= MDBX_TXN_ERROR;
|
txn->mt_flags |= MDBX_TXN_ERROR;
|
||||||
}
|
}
|
||||||
mdbx_assert(env, rc != MDBX_SUCCESS);
|
mdbx_assert(env, rc != MDBX_SUCCESS);
|
||||||
@ -4334,7 +4332,8 @@ skip_cache:
|
|||||||
}
|
}
|
||||||
|
|
||||||
done:
|
done:
|
||||||
mdbx_tassert(txn, mp && num);
|
if (unlikely(mp == nullptr))
|
||||||
|
return MDBX_SUCCESS;
|
||||||
mdbx_ensure(env, pgno >= NUM_METAS);
|
mdbx_ensure(env, pgno >= NUM_METAS);
|
||||||
if (env->me_flags & MDBX_WRITEMAP) {
|
if (env->me_flags & MDBX_WRITEMAP) {
|
||||||
np = pgno2page(env, pgno);
|
np = pgno2page(env, pgno);
|
||||||
@ -5685,60 +5684,6 @@ int mdbx_txn_abort(MDBX_txn *txn) {
|
|||||||
return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE);
|
return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE);
|
||||||
}
|
}
|
||||||
|
|
||||||
static __inline int mdbx_backlog_size(MDBX_txn *txn) {
|
|
||||||
int reclaimed = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
|
|
||||||
return reclaimed + txn->tw.loose_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline int mdbx_backlog_extragap(MDBX_env *env) {
|
|
||||||
/* LY: extra page(s) for b-tree rebalancing */
|
|
||||||
return (env->me_flags & MDBX_LIFORECLAIM) ? 2 : 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* LY: Prepare a backlog of pages to modify GC itself,
|
|
||||||
* while reclaiming is prohibited. It should be enough to prevent search
|
|
||||||
* in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */
|
|
||||||
static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) {
|
|
||||||
/* LY: extra page(s) for b-tree rebalancing */
|
|
||||||
const int extra =
|
|
||||||
mdbx_backlog_extragap(txn->mt_env) +
|
|
||||||
MDBX_PNL_SIZEOF(txn->tw.retired_pages) / txn->mt_env->me_maxgc_ov1page +
|
|
||||||
1;
|
|
||||||
|
|
||||||
if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) {
|
|
||||||
mc->mc_flags &= ~C_RECLAIMING;
|
|
||||||
int rc = mdbx_cursor_touch(mc);
|
|
||||||
if (unlikely(rc))
|
|
||||||
return rc;
|
|
||||||
|
|
||||||
while (unlikely(mdbx_backlog_size(txn) < extra)) {
|
|
||||||
rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC);
|
|
||||||
if (unlikely(rc)) {
|
|
||||||
if (rc != MDBX_NOTFOUND)
|
|
||||||
return rc;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mc->mc_flags |= C_RECLAIMING;
|
|
||||||
}
|
|
||||||
|
|
||||||
return MDBX_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mdbx_prep_backlog_data(MDBX_txn *txn, MDBX_cursor *mc,
|
|
||||||
size_t bytes) {
|
|
||||||
const int wanna = (int)number_of_ovpages(txn->mt_env, bytes) +
|
|
||||||
mdbx_backlog_extragap(txn->mt_env);
|
|
||||||
if (unlikely(wanna > mdbx_backlog_size(txn))) {
|
|
||||||
mc->mc_flags &= ~C_RECLAIMING;
|
|
||||||
do {
|
|
||||||
if (mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC) != MDBX_SUCCESS)
|
|
||||||
break;
|
|
||||||
} while (wanna > mdbx_backlog_size(txn));
|
|
||||||
mc->mc_flags |= C_RECLAIMING;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Count all the pages in each DB and in the freelist and make sure
|
/* Count all the pages in each DB and in the freelist and make sure
|
||||||
* it matches the actual number of pages being used.
|
* it matches the actual number of pages being used.
|
||||||
* All named DBs must be open for a correct count. */
|
* All named DBs must be open for a correct count. */
|
||||||
@ -5866,6 +5811,46 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
|
|||||||
return MDBX_PROBLEM;
|
return MDBX_PROBLEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __inline unsigned backlog_size(MDBX_txn *txn) {
|
||||||
|
return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline unsigned gctree_backlog(MDBX_txn *txn) {
|
||||||
|
return /* for split upto root page */ txn->mt_dbs[FREE_DBI].md_depth +
|
||||||
|
/* for rebalance */ 2 + /* for grow */ 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* LY: Prepare a backlog of pages to modify GC itself,
|
||||||
|
* while reclaiming is prohibited. It should be enough to prevent search
|
||||||
|
* in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */
|
||||||
|
static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor,
|
||||||
|
const size_t pnl_bytes) {
|
||||||
|
const unsigned linear = number_of_ovpages(
|
||||||
|
txn->mt_env,
|
||||||
|
pnl_bytes ? pnl_bytes : MDBX_PNL_SIZEOF(txn->tw.retired_pages));
|
||||||
|
const unsigned backlog = linear + gctree_backlog(txn);
|
||||||
|
|
||||||
|
if (likely(
|
||||||
|
linear == 1 &&
|
||||||
|
backlog_size(txn) >
|
||||||
|
(pnl_bytes
|
||||||
|
? backlog
|
||||||
|
: backlog + /* for COW */ txn->mt_dbs[FREE_DBI].md_depth)))
|
||||||
|
return MDBX_SUCCESS;
|
||||||
|
|
||||||
|
gc_cursor->mc_flags &= ~C_RECLAIMING;
|
||||||
|
|
||||||
|
int err = mdbx_cursor_touch(gc_cursor);
|
||||||
|
if (err == MDBX_SUCCESS && linear > 1)
|
||||||
|
err = mdbx_page_alloc(gc_cursor, linear, nullptr, MDBX_ALLOC_ALL);
|
||||||
|
|
||||||
|
while (err == MDBX_SUCCESS && backlog_size(txn) < backlog)
|
||||||
|
err = mdbx_page_alloc(gc_cursor, 1, NULL, MDBX_ALLOC_GC);
|
||||||
|
|
||||||
|
gc_cursor->mc_flags |= C_RECLAIMING;
|
||||||
|
return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) {
|
static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) {
|
||||||
/* PNL is initially empty, zero out at least the length */
|
/* PNL is initially empty, zero out at least the length */
|
||||||
memset(pnl.iov_base, 0, sizeof(pgno_t));
|
memset(pnl.iov_base, 0, sizeof(pgno_t));
|
||||||
@ -5940,7 +5925,7 @@ retry:
|
|||||||
continue;
|
continue;
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto bailout;
|
goto bailout;
|
||||||
rc = mdbx_prep_backlog(txn, &mc);
|
rc = mdbx_prep_backlog(txn, &mc, 0);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto bailout;
|
goto bailout;
|
||||||
mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest);
|
mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest);
|
||||||
@ -5977,7 +5962,7 @@ retry:
|
|||||||
if (cleaned_gc_id > txn->tw.last_reclaimed)
|
if (cleaned_gc_id > txn->tw.last_reclaimed)
|
||||||
break;
|
break;
|
||||||
if (cleaned_gc_id < txn->tw.last_reclaimed) {
|
if (cleaned_gc_id < txn->tw.last_reclaimed) {
|
||||||
rc = mdbx_prep_backlog(txn, &mc);
|
rc = mdbx_prep_backlog(txn, &mc, 0);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto bailout;
|
goto bailout;
|
||||||
}
|
}
|
||||||
@ -6095,7 +6080,7 @@ retry:
|
|||||||
key.iov_base = &txn->mt_txnid;
|
key.iov_base = &txn->mt_txnid;
|
||||||
do {
|
do {
|
||||||
data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages);
|
data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages);
|
||||||
mdbx_prep_backlog_data(txn, &mc, data.iov_len);
|
mdbx_prep_backlog(txn, &mc, data.iov_len);
|
||||||
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE);
|
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto bailout;
|
goto bailout;
|
||||||
@ -6347,7 +6332,7 @@ retry:
|
|||||||
data.iov_len = (chunk + 1) * sizeof(pgno_t);
|
data.iov_len = (chunk + 1) * sizeof(pgno_t);
|
||||||
mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk,
|
mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk,
|
||||||
settled + 1, settled + chunk + 1, reservation_gc_id);
|
settled + 1, settled + chunk + 1, reservation_gc_id);
|
||||||
mdbx_prep_backlog_data(txn, &mc, data.iov_len);
|
mdbx_prep_backlog(txn, &mc, data.iov_len);
|
||||||
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE);
|
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE);
|
||||||
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
|
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
|
||||||
txn->mt_next_pgno));
|
txn->mt_next_pgno));
|
||||||
@ -7460,8 +7445,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
|
|||||||
#endif /* MADV_FREE || MADV_DONTNEED */
|
#endif /* MADV_FREE || MADV_DONTNEED */
|
||||||
|
|
||||||
/* LY: check conditions to shrink datafile */
|
/* LY: check conditions to shrink datafile */
|
||||||
const pgno_t backlog_gap =
|
const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3;
|
||||||
pending->mm_dbs[FREE_DBI].md_depth + mdbx_backlog_extragap(env);
|
|
||||||
if (pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next >
|
if (pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next >
|
||||||
pending->mm_geo.shrink + backlog_gap) {
|
pending->mm_geo.shrink + backlog_gap) {
|
||||||
if (pending->mm_geo.now > largest_pgno &&
|
if (pending->mm_geo.now > largest_pgno &&
|
||||||
|
Loading…
x
Reference in New Issue
Block a user