mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-08 07:44:14 +08:00
mdbx: упрощение page_alloc_slowpath()
.
Упрощение за счет удаления проверки флага `MDBX_ALLOC_GC`, который всегда взведен при вызове page_alloc_slowpath().
This commit is contained in:
parent
3e05d1a427
commit
f73cd7a491
590
src/core.c
590
src/core.c
@ -5394,6 +5394,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) {
|
|||||||
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
|
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
|
||||||
if (unlikely(lck == NULL /* exclusive without-lck mode */)) {
|
if (unlikely(lck == NULL /* exclusive without-lck mode */)) {
|
||||||
eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub);
|
eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub);
|
||||||
|
env->me_lck->mti_readers_refresh_flag.weak = nothing_changed;
|
||||||
return env->me_lck->mti_oldest_reader.weak = steady;
|
return env->me_lck->mti_oldest_reader.weak = steady;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6623,6 +6624,14 @@ static int gc_cursor_init(MDBX_cursor *mc, MDBX_txn *txn) {
|
|||||||
return cursor_init(mc, txn, FREE_DBI);
|
return cursor_init(mc, txn, FREE_DBI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Tells whether the GC record with transaction id `id` is already listed in
 * txn->tw.lifo_reclaimed.
 *
 * txn - transaction whose tw.lifo_reclaimed list is searched. The pointer and
 *       the list are dereferenced without a NULL check; the caller visible in
 *       this diff (the LIFO branch of page_alloc_slowpath()) allocates the
 *       list before reaching this call.
 * id  - transaction id of the GC record to look for.
 *
 * Returns true when some entry of the list equals `id`, false otherwise.
 * The lookup is a plain linear scan over the whole list. */
__hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) {
|
||||||
|
const size_t len = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed);
|
||||||
|
for (size_t i = 1; i <= len; ++i) /* entries are compared at indices 1..len; slot 0 is never compared (presumably the length header read by MDBX_PNL_GETSIZE - TODO confirm) */
|
||||||
|
if (txn->tw.lifo_reclaimed[i] == id)
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
|
static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
|
||||||
char flags) {
|
char flags) {
|
||||||
#if MDBX_ENABLE_PROFGC
|
#if MDBX_ENABLE_PROFGC
|
||||||
@ -6642,11 +6651,9 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
|
|||||||
prof->spe_counter += 1;
|
prof->spe_counter += 1;
|
||||||
#endif /* MDBX_ENABLE_PROFGC */
|
#endif /* MDBX_ENABLE_PROFGC */
|
||||||
|
|
||||||
|
eASSERT(env, flags & MDBX_ALLOC_GC);
|
||||||
eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT));
|
eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT));
|
||||||
eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW));
|
eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW));
|
||||||
eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE |
|
|
||||||
MDBX_ALLOC_BACKLOG)) == 0 ||
|
|
||||||
(flags & MDBX_ALLOC_GC));
|
|
||||||
eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE |
|
eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE |
|
||||||
MDBX_ALLOC_BACKLOG)) == 0 ||
|
MDBX_ALLOC_BACKLOG)) == 0 ||
|
||||||
(flags & MDBX_ALLOC_NEW) == 0);
|
(flags & MDBX_ALLOC_NEW) == 0);
|
||||||
@ -6654,9 +6661,12 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
|
|||||||
txn->mt_next_pgno - MDBX_ENABLE_REFUND));
|
txn->mt_next_pgno - MDBX_ENABLE_REFUND));
|
||||||
|
|
||||||
pgno_t pgno = 0, *range = nullptr;
|
pgno_t pgno = 0, *range = nullptr;
|
||||||
size_t re_len = MDBX_PNL_GETSIZE(txn->tw.relist);
|
size_t newnext, re_len = MDBX_PNL_GETSIZE(txn->tw.relist);
|
||||||
if (num > 1) {
|
if (num > 1) {
|
||||||
eASSERT(env, !(flags & MDBX_ALLOC_SLOT));
|
eASSERT(env, !(flags & MDBX_ALLOC_SLOT));
|
||||||
|
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
|
||||||
|
if (unlikely(txn->mt_flags & MDBX_TXN_FROZEN_RE))
|
||||||
|
goto no_gc;
|
||||||
#if MDBX_ENABLE_PROFGC
|
#if MDBX_ENABLE_PROFGC
|
||||||
prof->xpages += 1;
|
prof->xpages += 1;
|
||||||
#endif /* MDBX_ENABLE_PROFGC */
|
#endif /* MDBX_ENABLE_PROFGC */
|
||||||
@ -6672,334 +6682,322 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) ||
|
eASSERT(env,
|
||||||
MDBX_PNL_GETSIZE(txn->tw.relist) == 0);
|
(flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || re_len == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------
|
//---------------------------------------------------------------------------
|
||||||
|
|
||||||
if (likely(flags & MDBX_ALLOC_GC)) {
|
if (unlikely(!is_gc_usable(txn)))
|
||||||
if (unlikely(!is_gc_usable(txn)))
|
goto no_gc;
|
||||||
goto no_gc;
|
|
||||||
|
|
||||||
eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0);
|
eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0);
|
||||||
flags += (env->me_flags & MDBX_LIFORECLAIM) ? MDBX_ALLOC_LIFO : 0;
|
flags += (env->me_flags & MDBX_LIFORECLAIM) ? MDBX_ALLOC_LIFO : 0;
|
||||||
|
|
||||||
const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2;
|
const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2;
|
||||||
if (txn->mt_dbs[FREE_DBI].md_branch_pages &&
|
if (txn->mt_dbs[FREE_DBI].md_branch_pages &&
|
||||||
MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num)
|
MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num)
|
||||||
flags += MDBX_ALLOC_COALESCE;
|
flags += MDBX_ALLOC_COALESCE;
|
||||||
|
|
||||||
MDBX_cursor recur;
|
MDBX_cursor recur;
|
||||||
ret.err = gc_cursor_init(&recur, txn);
|
ret.err = gc_cursor_init(&recur, txn);
|
||||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
retry_gc_refresh_oldest:;
|
retry_gc_refresh_oldest:;
|
||||||
txnid_t oldest = txn_oldest_reader(txn);
|
txnid_t oldest = txn_oldest_reader(txn);
|
||||||
if (unlikely(!oldest))
|
retry_gc_have_oldest:
|
||||||
goto no_gc;
|
if (unlikely(oldest >= txn->mt_txnid)) {
|
||||||
|
ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN
|
||||||
|
" for current-txnid %" PRIaTXN,
|
||||||
|
oldest, txn->mt_txnid);
|
||||||
|
ret.err = MDBX_PROBLEM;
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
const txnid_t detent = oldest + 1;
|
||||||
|
|
||||||
retry_gc_have_oldest:
|
txnid_t id = 0;
|
||||||
if (unlikely(oldest >= txn->mt_txnid)) {
|
bool should_scan = false;
|
||||||
ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN
|
MDBX_cursor_op op = MDBX_FIRST;
|
||||||
" for current-txnid %" PRIaTXN,
|
if (flags & MDBX_ALLOC_LIFO) {
|
||||||
oldest, txn->mt_txnid);
|
if (!txn->tw.lifo_reclaimed) {
|
||||||
ret.err = MDBX_PROBLEM;
|
txn->tw.lifo_reclaimed = txl_alloc();
|
||||||
goto fail;
|
if (unlikely(!txn->tw.lifo_reclaimed)) {
|
||||||
}
|
ret.err = MDBX_ENOMEM;
|
||||||
const txnid_t detent = oldest + 1;
|
goto fail;
|
||||||
|
|
||||||
txnid_t last = 0;
|
|
||||||
bool should_scan = false;
|
|
||||||
MDBX_cursor_op op = MDBX_FIRST;
|
|
||||||
if (flags & MDBX_ALLOC_LIFO) {
|
|
||||||
if (!txn->tw.lifo_reclaimed) {
|
|
||||||
txn->tw.lifo_reclaimed = txl_alloc();
|
|
||||||
if (unlikely(!txn->tw.lifo_reclaimed)) {
|
|
||||||
ret.err = MDBX_ENOMEM;
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
/* Begin lookup backward from oldest reader */
|
|
||||||
last = detent - 1;
|
|
||||||
op = MDBX_SET_RANGE;
|
|
||||||
} else if (txn->tw.last_reclaimed) {
|
|
||||||
/* Continue lookup forward from last-reclaimed */
|
|
||||||
last = txn->tw.last_reclaimed + 1;
|
|
||||||
if (last >= detent)
|
|
||||||
goto no_gc;
|
|
||||||
op = MDBX_SET_RANGE;
|
|
||||||
}
|
}
|
||||||
|
/* Begin lookup backward from oldest reader */
|
||||||
|
id = detent - 1;
|
||||||
|
op = MDBX_SET_RANGE;
|
||||||
|
} else if (txn->tw.last_reclaimed) {
|
||||||
|
/* Continue lookup forward from last-reclaimed */
|
||||||
|
id = txn->tw.last_reclaimed + 1;
|
||||||
|
if (id >= detent)
|
||||||
|
goto depleted_gc;
|
||||||
|
op = MDBX_SET_RANGE;
|
||||||
|
}
|
||||||
|
|
||||||
next_gc:;
|
next_gc:;
|
||||||
MDBX_val key;
|
MDBX_val key;
|
||||||
key.iov_base = &last;
|
key.iov_base = &id;
|
||||||
key.iov_len = sizeof(last);
|
key.iov_len = sizeof(id);
|
||||||
|
|
||||||
#if MDBX_ENABLE_PROFGC
|
#if MDBX_ENABLE_PROFGC
|
||||||
prof->rsteps += 1;
|
prof->rsteps += 1;
|
||||||
#endif /* MDBX_ENABLE_PROFGC */
|
#endif /* MDBX_ENABLE_PROFGC */
|
||||||
|
|
||||||
/* Seek first/next GC record */
|
/* Seek first/next GC record */
|
||||||
ret.err = mdbx_cursor_get(&recur, &key, NULL, op);
|
ret.err = mdbx_cursor_get(&recur, &key, NULL, op);
|
||||||
if (unlikely(ret.err != MDBX_SUCCESS)) {
|
if (unlikely(ret.err != MDBX_SUCCESS)) {
|
||||||
if (unlikely(ret.err != MDBX_NOTFOUND))
|
if (unlikely(ret.err != MDBX_NOTFOUND))
|
||||||
goto fail;
|
|
||||||
if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) {
|
|
||||||
op = MDBX_PREV;
|
|
||||||
goto next_gc;
|
|
||||||
}
|
|
||||||
goto depleted_gc;
|
|
||||||
}
|
|
||||||
if (unlikely(key.iov_len != sizeof(txnid_t))) {
|
|
||||||
ret.err = MDBX_CORRUPTED;
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) {
|
||||||
last = unaligned_peek_u64(4, key.iov_base);
|
|
||||||
if (flags & MDBX_ALLOC_LIFO) {
|
|
||||||
op = MDBX_PREV;
|
op = MDBX_PREV;
|
||||||
if (last >= detent)
|
goto next_gc;
|
||||||
goto next_gc;
|
|
||||||
/* skip IDs of records that already reclaimed */
|
|
||||||
for (size_t i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i)
|
|
||||||
if (txn->tw.lifo_reclaimed[i] == last)
|
|
||||||
goto next_gc;
|
|
||||||
} else {
|
|
||||||
op = MDBX_NEXT;
|
|
||||||
if (unlikely(last >= detent))
|
|
||||||
goto depleted_gc;
|
|
||||||
}
|
}
|
||||||
|
goto depleted_gc;
|
||||||
|
}
|
||||||
|
if (unlikely(key.iov_len != sizeof(txnid_t))) {
|
||||||
|
ret.err = MDBX_CORRUPTED;
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
id = unaligned_peek_u64(4, key.iov_base);
|
||||||
|
if (flags & MDBX_ALLOC_LIFO) {
|
||||||
|
op = MDBX_PREV;
|
||||||
|
if (id >= detent || is_already_reclaimed(txn, id))
|
||||||
|
goto next_gc;
|
||||||
|
} else {
|
||||||
|
op = MDBX_NEXT;
|
||||||
|
if (unlikely(id >= detent))
|
||||||
|
goto depleted_gc;
|
||||||
|
}
|
||||||
|
|
||||||
/* Reading next GC record */
|
/* Reading next GC record */
|
||||||
MDBX_val data;
|
MDBX_val data;
|
||||||
MDBX_page *const mp = recur.mc_pg[recur.mc_top];
|
MDBX_page *const mp = recur.mc_pg[recur.mc_top];
|
||||||
if (unlikely((ret.err = node_read(&recur,
|
if (unlikely(
|
||||||
page_node(mp, recur.mc_ki[recur.mc_top]),
|
(ret.err = node_read(&recur, page_node(mp, recur.mc_ki[recur.mc_top]),
|
||||||
&data, mp)) != MDBX_SUCCESS))
|
&data, mp)) != MDBX_SUCCESS))
|
||||||
|
goto fail;
|
||||||
|
|
||||||
|
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
|
||||||
|
pgno_t *gc_pnl = (pgno_t *)data.iov_base;
|
||||||
|
if (unlikely(data.iov_len % sizeof(pgno_t) ||
|
||||||
|
data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
|
||||||
|
!pnl_check(gc_pnl, txn->mt_next_pgno))) {
|
||||||
|
ret.err = MDBX_CORRUPTED;
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl);
|
||||||
|
if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(
|
||||||
|
txn->tw.relist) >= env->me_options.rp_augment_limit) &&
|
||||||
|
((/* not a slot-request from gc-update */
|
||||||
|
(flags & MDBX_ALLOC_SLOT) == 0 &&
|
||||||
|
/* have enough unallocated space */ txn->mt_geo.upper >=
|
||||||
|
txn->mt_next_pgno + num) ||
|
||||||
|
gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) {
|
||||||
|
/* Stop reclaiming to avoid large/overflow the page list.
|
||||||
|
* This is a rare case while search for a continuously multi-page region
|
||||||
|
* in a large database.
|
||||||
|
* https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123
|
||||||
|
*/
|
||||||
|
NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu "
|
||||||
|
"(chunk) -> %zu",
|
||||||
|
MDBX_PNL_GETSIZE(txn->tw.relist), gc_len,
|
||||||
|
gc_len + MDBX_PNL_GETSIZE(txn->tw.relist));
|
||||||
|
goto depleted_gc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Remember ID of readed GC record */
|
||||||
|
txn->tw.last_reclaimed = id;
|
||||||
|
if (flags & MDBX_ALLOC_LIFO) {
|
||||||
|
ret.err = txl_append(&txn->tw.lifo_reclaimed, id);
|
||||||
|
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
|
/* Append PNL from GC record to tw.relist */
|
||||||
pgno_t *gc_pnl = (pgno_t *)data.iov_base;
|
ret.err = pnl_need(&txn->tw.relist, gc_len);
|
||||||
if (unlikely(data.iov_len % sizeof(pgno_t) ||
|
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||||
data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
|
goto fail;
|
||||||
!pnl_check(gc_pnl, txn->mt_next_pgno))) {
|
|
||||||
|
if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
|
||||||
|
DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO
|
||||||
|
" len %zu, PNL",
|
||||||
|
id, txn->mt_dbs[FREE_DBI].md_root, gc_len);
|
||||||
|
for (size_t i = gc_len; i; i--)
|
||||||
|
DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]);
|
||||||
|
DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Merge in descending sorted order */
|
||||||
|
re_len = pnl_merge(txn->tw.relist, gc_pnl);
|
||||||
|
should_scan = true;
|
||||||
|
if (AUDIT_ENABLED()) {
|
||||||
|
if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) {
|
||||||
ret.err = MDBX_CORRUPTED;
|
ret.err = MDBX_CORRUPTED;
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl);
|
} else {
|
||||||
if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(
|
eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno));
|
||||||
txn->tw.relist) >= env->me_options.rp_augment_limit) &&
|
}
|
||||||
((/* not a slot-request from gc-update */
|
eASSERT(env, dirtylist_check(txn));
|
||||||
(flags & MDBX_ALLOC_SLOT) == 0 &&
|
|
||||||
/* have enough unallocated space */ txn->mt_geo.upper >=
|
|
||||||
txn->mt_next_pgno + num) ||
|
|
||||||
gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) {
|
|
||||||
/* Stop reclaiming to avoid large/overflow the page list.
|
|
||||||
* This is a rare case while search for a continuously multi-page region
|
|
||||||
* in a large database.
|
|
||||||
* https://web.archive.org/web/https://github.com/erthink/libmdbx/issues/123 */
|
|
||||||
NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu "
|
|
||||||
"(chunk) -> %zu",
|
|
||||||
MDBX_PNL_GETSIZE(txn->tw.relist), gc_len,
|
|
||||||
gc_len + MDBX_PNL_GETSIZE(txn->tw.relist));
|
|
||||||
goto depleted_gc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Remember ID of readed GC record */
|
eASSERT(env,
|
||||||
txn->tw.last_reclaimed = last;
|
re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno);
|
||||||
if (flags & MDBX_ALLOC_LIFO) {
|
if (MDBX_ENABLE_REFUND && re_len &&
|
||||||
ret.err = txl_append(&txn->tw.lifo_reclaimed, last);
|
unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) {
|
||||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
/* Refund suitable pages into "unallocated" space */
|
||||||
goto fail;
|
txn_refund(txn);
|
||||||
}
|
re_len = MDBX_PNL_GETSIZE(txn->tw.relist);
|
||||||
|
}
|
||||||
|
eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist));
|
||||||
|
eASSERT(env, pnl_check_allocated(txn->tw.relist,
|
||||||
|
txn->mt_next_pgno - MDBX_ENABLE_REFUND));
|
||||||
|
|
||||||
/* Append PNL from GC record to tw.relist */
|
/* Done for a kick-reclaim mode, actually no page needed */
|
||||||
ret.err = pnl_need(&txn->tw.relist, gc_len);
|
if (unlikely(flags & MDBX_ALLOC_SLOT)) {
|
||||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
eASSERT(env, ret.err == MDBX_SUCCESS);
|
||||||
goto fail;
|
goto early_exit;
|
||||||
txn->tw.relist = txn->tw.relist;
|
}
|
||||||
|
|
||||||
if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
|
/* TODO: delete reclaimed records */
|
||||||
DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO
|
|
||||||
" len %zu, PNL",
|
|
||||||
last, txn->mt_dbs[FREE_DBI].md_root, gc_len);
|
|
||||||
for (size_t i = gc_len; i; i--)
|
|
||||||
DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]);
|
|
||||||
DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Merge in descending sorted order */
|
/* Don't try to coalesce too much. */
|
||||||
re_len = pnl_merge(txn->tw.relist, gc_pnl);
|
eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT);
|
||||||
should_scan = true;
|
if (flags & MDBX_ALLOC_COALESCE) {
|
||||||
if (AUDIT_ENABLED()) {
|
if (re_len /* current size */ < coalesce_threshold) {
|
||||||
if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) {
|
|
||||||
ret.err = MDBX_CORRUPTED;
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno));
|
|
||||||
}
|
|
||||||
eASSERT(env, dirtylist_check(txn));
|
|
||||||
|
|
||||||
eASSERT(env,
|
|
||||||
re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno);
|
|
||||||
if (MDBX_ENABLE_REFUND && re_len &&
|
|
||||||
unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) {
|
|
||||||
/* Refund suitable pages into "unallocated" space */
|
|
||||||
if (txn_refund(txn))
|
|
||||||
re_len = MDBX_PNL_GETSIZE(txn->tw.relist);
|
|
||||||
}
|
|
||||||
eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist));
|
|
||||||
eASSERT(env, pnl_check_allocated(txn->tw.relist,
|
|
||||||
txn->mt_next_pgno - MDBX_ENABLE_REFUND));
|
|
||||||
|
|
||||||
/* Done for a kick-reclaim mode, actually no page needed */
|
|
||||||
if (unlikely(flags & MDBX_ALLOC_SLOT)) {
|
|
||||||
eASSERT(env, ret.err == MDBX_SUCCESS);
|
|
||||||
goto early_exit;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* TODO: delete reclaimed records */
|
|
||||||
|
|
||||||
/* Don't try to coalesce too much. */
|
|
||||||
eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT);
|
|
||||||
if (flags & MDBX_ALLOC_COALESCE) {
|
|
||||||
if (re_len /* current size */ < coalesce_threshold) {
|
|
||||||
#if MDBX_ENABLE_PROFGC
|
#if MDBX_ENABLE_PROFGC
|
||||||
env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1;
|
env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1;
|
||||||
#endif /* MDBX_ENABLE_PROFGC */
|
#endif /* MDBX_ENABLE_PROFGC */
|
||||||
goto next_gc;
|
|
||||||
}
|
|
||||||
TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold");
|
|
||||||
flags &= ~MDBX_ALLOC_COALESCE;
|
|
||||||
}
|
|
||||||
|
|
||||||
scan:
|
|
||||||
eASSERT(env, should_scan);
|
|
||||||
if (re_len >= num) {
|
|
||||||
eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno &&
|
|
||||||
MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno);
|
|
||||||
range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len);
|
|
||||||
pgno = *range;
|
|
||||||
if (num == 1)
|
|
||||||
goto done;
|
|
||||||
range = scan4seq(range, re_len, num - 1);
|
|
||||||
eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1));
|
|
||||||
if (likely(range)) {
|
|
||||||
pgno = *range;
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
should_scan = false;
|
|
||||||
if (ret.err == MDBX_SUCCESS)
|
|
||||||
goto next_gc;
|
goto next_gc;
|
||||||
|
|
||||||
depleted_gc:
|
|
||||||
ret.err = MDBX_NOTFOUND;
|
|
||||||
if (should_scan)
|
|
||||||
goto scan;
|
|
||||||
|
|
||||||
//-------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/* There is no suitable pages in the GC and to be able to allocate
|
|
||||||
* we should CHOICE one of:
|
|
||||||
* - make a new steady checkpoint if reclaiming was stopped by
|
|
||||||
* the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode;
|
|
||||||
* - kick lagging reader(s) if reclaiming was stopped by ones of it.
|
|
||||||
* - extend the database file. */
|
|
||||||
|
|
||||||
/* Will use new pages from the map if nothing is suitable in the GC. */
|
|
||||||
pgno = txn->mt_next_pgno;
|
|
||||||
const size_t newnext = num + pgno;
|
|
||||||
|
|
||||||
const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
|
|
||||||
const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
|
|
||||||
/* does reclaiming stopped at the last steady point? */
|
|
||||||
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady &&
|
|
||||||
detent == prefer_steady.txnid + 1) {
|
|
||||||
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN
|
|
||||||
"-%s, detent %" PRIaTXN,
|
|
||||||
recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid,
|
|
||||||
durable_caption(prefer_steady.ptr_c), detent);
|
|
||||||
const pgno_t autosync_threshold =
|
|
||||||
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
|
|
||||||
const uint64_t autosync_period =
|
|
||||||
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
|
|
||||||
uint64_t eoos_timestamp;
|
|
||||||
/* wipe the last steady-point if one of:
|
|
||||||
* - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
|
|
||||||
* - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
|
|
||||||
* otherwise, make a new steady-point if one of:
|
|
||||||
* - auto-sync threshold is specified and reached;
|
|
||||||
* - upper limit of database size is reached;
|
|
||||||
* - database is full (with the current file size)
|
|
||||||
* AND auto-sync threshold it NOT specified */
|
|
||||||
if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
|
|
||||||
((autosync_threshold | autosync_period) == 0 ||
|
|
||||||
newnext >= prefer_steady.ptr_c->mm_geo.now)) {
|
|
||||||
/* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
|
|
||||||
* without any auto-sync threshold(s). */
|
|
||||||
#if MDBX_ENABLE_PROFGC
|
|
||||||
env->me_lck->mti_pgop_stat.gc_prof.wipes += 1;
|
|
||||||
#endif /* MDBX_ENABLE_PROFGC */
|
|
||||||
ret.err = wipe_steady(txn, detent);
|
|
||||||
DEBUG("gc-wipe-steady, rc %d", ret.err);
|
|
||||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
|
||||||
goto fail;
|
|
||||||
eASSERT(env, prefer_steady.ptr_c !=
|
|
||||||
meta_prefer_steady(env, &txn->tw.troika).ptr_c);
|
|
||||||
goto retry_gc_refresh_oldest;
|
|
||||||
}
|
|
||||||
if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 ||
|
|
||||||
(autosync_threshold &&
|
|
||||||
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
|
|
||||||
autosync_threshold) ||
|
|
||||||
(autosync_period &&
|
|
||||||
(eoos_timestamp =
|
|
||||||
atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
|
|
||||||
osal_monotime() - eoos_timestamp >= autosync_period) ||
|
|
||||||
newnext >= txn->mt_geo.upper ||
|
|
||||||
(newnext >= txn->mt_end_pgno &&
|
|
||||||
(autosync_threshold | autosync_period) == 0)) {
|
|
||||||
/* make steady checkpoint. */
|
|
||||||
#if MDBX_ENABLE_PROFGC
|
|
||||||
env->me_lck->mti_pgop_stat.gc_prof.flushes += 1;
|
|
||||||
#endif /* MDBX_ENABLE_PROFGC */
|
|
||||||
MDBX_meta meta = *recent.ptr_c;
|
|
||||||
ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta,
|
|
||||||
&txn->tw.troika);
|
|
||||||
DEBUG("gc-make-steady, rc %d", ret.err);
|
|
||||||
eASSERT(env, ret.err != MDBX_RESULT_TRUE);
|
|
||||||
if (unlikely(ret.err != MDBX_SUCCESS))
|
|
||||||
goto fail;
|
|
||||||
eASSERT(env, prefer_steady.ptr_c !=
|
|
||||||
meta_prefer_steady(env, &txn->tw.troika).ptr_c);
|
|
||||||
goto retry_gc_refresh_oldest;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold");
|
||||||
|
flags &= ~MDBX_ALLOC_COALESCE;
|
||||||
|
}
|
||||||
|
|
||||||
if (env->me_lck_mmap.lck &&
|
scan:
|
||||||
unlikely(true ==
|
eASSERT(env, should_scan);
|
||||||
atomic_load32(&env->me_lck_mmap.lck->mti_readers_refresh_flag,
|
if (re_len >= num) {
|
||||||
mo_AcquireRelease))) {
|
eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno &&
|
||||||
oldest = txn_oldest_reader(txn);
|
MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno);
|
||||||
if (oldest >= detent)
|
range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len);
|
||||||
goto retry_gc_have_oldest;
|
pgno = *range;
|
||||||
}
|
if (num == 1)
|
||||||
|
goto done;
|
||||||
/* avoid kick lagging reader(s) if is enough unallocated space
|
range = scan4seq(range, re_len, num - 1);
|
||||||
* at the end of database file. */
|
eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1));
|
||||||
if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) {
|
if (likely(range)) {
|
||||||
eASSERT(env, range == nullptr);
|
pgno = *range;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
should_scan = false;
|
||||||
|
if (ret.err == MDBX_SUCCESS)
|
||||||
|
goto next_gc;
|
||||||
|
|
||||||
if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) {
|
depleted_gc:
|
||||||
oldest = kick_longlived_readers(env, oldest);
|
ret.err = MDBX_NOTFOUND;
|
||||||
if (oldest >= detent)
|
if (should_scan)
|
||||||
goto retry_gc_have_oldest;
|
goto scan;
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/* There is no suitable pages in the GC and to be able to allocate
|
||||||
|
* we should CHOICE one of:
|
||||||
|
* - make a new steady checkpoint if reclaiming was stopped by
|
||||||
|
* the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode;
|
||||||
|
* - kick lagging reader(s) if reclaiming was stopped by ones of it.
|
||||||
|
* - extend the database file. */
|
||||||
|
|
||||||
|
/* Will use new pages from the map if nothing is suitable in the GC. */
|
||||||
|
newnext = (pgno = txn->mt_next_pgno) + num;
|
||||||
|
|
||||||
|
/* Does reclaiming stopped at the last steady point? */
|
||||||
|
const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
|
||||||
|
const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
|
||||||
|
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady &&
|
||||||
|
detent == prefer_steady.txnid + 1) {
|
||||||
|
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN
|
||||||
|
"-%s, detent %" PRIaTXN,
|
||||||
|
recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid,
|
||||||
|
durable_caption(prefer_steady.ptr_c), detent);
|
||||||
|
const pgno_t autosync_threshold =
|
||||||
|
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
|
||||||
|
const uint64_t autosync_period =
|
||||||
|
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
|
||||||
|
uint64_t eoos_timestamp;
|
||||||
|
/* wipe the last steady-point if one of:
|
||||||
|
* - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
|
||||||
|
* - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
|
||||||
|
* otherwise, make a new steady-point if one of:
|
||||||
|
* - auto-sync threshold is specified and reached;
|
||||||
|
* - upper limit of database size is reached;
|
||||||
|
* - database is full (with the current file size)
|
||||||
|
* AND auto-sync threshold it NOT specified */
|
||||||
|
if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
|
||||||
|
((autosync_threshold | autosync_period) == 0 ||
|
||||||
|
newnext >= prefer_steady.ptr_c->mm_geo.now)) {
|
||||||
|
/* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
|
||||||
|
* without any auto-sync threshold(s). */
|
||||||
|
#if MDBX_ENABLE_PROFGC
|
||||||
|
env->me_lck->mti_pgop_stat.gc_prof.wipes += 1;
|
||||||
|
#endif /* MDBX_ENABLE_PROFGC */
|
||||||
|
ret.err = wipe_steady(txn, detent);
|
||||||
|
DEBUG("gc-wipe-steady, rc %d", ret.err);
|
||||||
|
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||||
|
goto fail;
|
||||||
|
eASSERT(env, prefer_steady.ptr_c !=
|
||||||
|
meta_prefer_steady(env, &txn->tw.troika).ptr_c);
|
||||||
|
goto retry_gc_refresh_oldest;
|
||||||
}
|
}
|
||||||
|
if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 ||
|
||||||
|
(autosync_threshold &&
|
||||||
|
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
|
||||||
|
autosync_threshold) ||
|
||||||
|
(autosync_period &&
|
||||||
|
(eoos_timestamp =
|
||||||
|
atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
|
||||||
|
osal_monotime() - eoos_timestamp >= autosync_period) ||
|
||||||
|
newnext >= txn->mt_geo.upper ||
|
||||||
|
(newnext >= txn->mt_end_pgno &&
|
||||||
|
(autosync_threshold | autosync_period) == 0)) {
|
||||||
|
/* make steady checkpoint. */
|
||||||
|
#if MDBX_ENABLE_PROFGC
|
||||||
|
env->me_lck->mti_pgop_stat.gc_prof.flushes += 1;
|
||||||
|
#endif /* MDBX_ENABLE_PROFGC */
|
||||||
|
MDBX_meta meta = *recent.ptr_c;
|
||||||
|
ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta,
|
||||||
|
&txn->tw.troika);
|
||||||
|
DEBUG("gc-make-steady, rc %d", ret.err);
|
||||||
|
eASSERT(env, ret.err != MDBX_RESULT_TRUE);
|
||||||
|
if (unlikely(ret.err != MDBX_SUCCESS))
|
||||||
|
goto fail;
|
||||||
|
eASSERT(env, prefer_steady.ptr_c !=
|
||||||
|
meta_prefer_steady(env, &txn->tw.troika).ptr_c);
|
||||||
|
goto retry_gc_refresh_oldest;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(true == atomic_load32(&env->me_lck->mti_readers_refresh_flag,
|
||||||
|
mo_AcquireRelease))) {
|
||||||
|
oldest = txn_oldest_reader(txn);
|
||||||
|
if (oldest >= detent)
|
||||||
|
goto retry_gc_have_oldest;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Avoid kick lagging reader(s) if is enough unallocated space
|
||||||
|
* at the end of database file. */
|
||||||
|
if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) {
|
||||||
|
eASSERT(env, range == nullptr);
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) {
|
||||||
|
oldest = kick_longlived_readers(env, oldest);
|
||||||
|
if (oldest >= detent)
|
||||||
|
goto retry_gc_have_oldest;
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------
|
//---------------------------------------------------------------------------
|
||||||
@ -7011,8 +7009,7 @@ no_gc:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Will use new pages from the map if nothing is suitable in the GC. */
|
/* Will use new pages from the map if nothing is suitable in the GC. */
|
||||||
pgno = txn->mt_next_pgno;
|
newnext = (pgno = txn->mt_next_pgno) + num;
|
||||||
const size_t newnext = num + pgno;
|
|
||||||
if (newnext <= txn->mt_end_pgno)
|
if (newnext <= txn->mt_end_pgno)
|
||||||
goto done;
|
goto done;
|
||||||
|
|
||||||
@ -7055,6 +7052,7 @@ done:
|
|||||||
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
|
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
|
||||||
eASSERT(env, pgno == *range);
|
eASSERT(env, pgno == *range);
|
||||||
eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS);
|
eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS);
|
||||||
|
eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist));
|
||||||
/* Cutoff allocated pages from tw.relist */
|
/* Cutoff allocated pages from tw.relist */
|
||||||
#if MDBX_PNL_ASCENDING
|
#if MDBX_PNL_ASCENDING
|
||||||
for (const pgno_t *const end = re_list + re_len - num; range <= end;
|
for (const pgno_t *const end = re_list + re_len - num; range <= end;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user