mdbx: переработка внутренних флагов связанных с выделением страниц из GC.

This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2022-11-22 01:11:46 +03:00
parent 141cce0c0f
commit da023657f5
3 changed files with 116 additions and 144 deletions

View File

@ -5,8 +5,8 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD
2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| | 2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| |
3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | 3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | |
4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | 4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | |
5 |0000 0020| |TXN_UPDATE_GC |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | 5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | |
6 |0000 0040| |TXN_FROZEN_RE |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | 6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | |
7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | | | | 7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | | | |
8 |0000 0100| _MAY_MOVE | | | | | | | <= | 8 |0000 0100| _MAY_MOVE | | | | | | | <= |
9 |0000 0200| _MAY_UNMAP| | | | | | | <= | 9 |0000 0200| _MAY_UNMAP| | | | | | | <= |

View File

@ -6588,20 +6588,19 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len,
* *
* Returns 0 on success, non-zero on failure.*/ * Returns 0 on success, non-zero on failure.*/
#define MDBX_ALLOC_GC 1 #define MDBX_ALLOC_DEFAULT 0
#define MDBX_ALLOC_NEW 2 #define MDBX_ALLOC_RESERVE 1
#define MDBX_ALLOC_COALESCE 4 #define MDBX_ALLOC_UNIMPORTANT 2
#define MDBX_ALLOC_SLOT 8 #define MDBX_ALLOC_COALESCE 4 /* внутреннее состояние */
#define MDBX_ALLOC_RESERVE 16 #define MDBX_ALLOC_SHOULD_SCAN 8 /* внутреннее состояние */
#define MDBX_ALLOC_BACKLOG 32 #define MDBX_ALLOC_LIFO 16 /* внутреннее состояние */
#define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW)
#define MDBX_ALLOC_SHOULD_SCAN 64 /* internal state */
#define MDBX_ALLOC_LIFO 128 /* internal state */
static __inline bool is_gc_usable(const MDBX_txn *txn) { static __inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc,
const uint8_t flags) {
/* If txn is updating the GC, then the retired-list cannot play catch-up with /* If txn is updating the GC, then the retired-list cannot play catch-up with
* itself by growing while trying to save it. */ * itself by growing while trying to save it. */
if (txn->mt_flags & (MDBX_TXN_UPDATE_GC | MDBX_TXN_FROZEN_RE)) if (mc->mc_dbi == FREE_DBI && !(flags & MDBX_ALLOC_RESERVE) &&
!(mc->mc_flags & C_GCU))
return false; return false;
/* avoid (recursive) search inside empty tree and while tree is /* avoid (recursive) search inside empty tree and while tree is
@ -6609,11 +6608,6 @@ static __inline bool is_gc_usable(const MDBX_txn *txn) {
if (txn->mt_dbs[FREE_DBI].md_entries == 0) if (txn->mt_dbs[FREE_DBI].md_entries == 0)
return false; return false;
/* If our dirty list is already full, we can't touch GC */
if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) &&
!(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY))
return false;
return true; return true;
} }
@ -6644,22 +6638,13 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
prof->spe_counter += 1; prof->spe_counter += 1;
#endif /* MDBX_ENABLE_PROFGC */ #endif /* MDBX_ENABLE_PROFGC */
eASSERT(env, flags & MDBX_ALLOC_GC); eASSERT(env, num > 0 || (flags & MDBX_ALLOC_RESERVE));
eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT));
eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW));
eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE |
MDBX_ALLOC_BACKLOG)) == 0 ||
(flags & MDBX_ALLOC_NEW) == 0);
eASSERT(env, pnl_check_allocated(txn->tw.relist, eASSERT(env, pnl_check_allocated(txn->tw.relist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
pgno_t pgno = 0, *range = nullptr; pgno_t pgno = 0, *range = nullptr;
size_t newnext, re_len = MDBX_PNL_GETSIZE(txn->tw.relist); size_t newnext, re_len = MDBX_PNL_GETSIZE(txn->tw.relist);
if (num > 1) { if (num > 1) {
eASSERT(env, !(flags & MDBX_ALLOC_SLOT));
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
if (unlikely(txn->mt_flags & MDBX_TXN_FROZEN_RE))
goto no_gc;
#if MDBX_ENABLE_PROFGC #if MDBX_ENABLE_PROFGC
prof->xpages += 1; prof->xpages += 1;
#endif /* MDBX_ENABLE_PROFGC */ #endif /* MDBX_ENABLE_PROFGC */
@ -6675,13 +6660,12 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
} }
} }
} else { } else {
eASSERT(env, eASSERT(env, num == 0 || re_len == 0);
(flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || re_len == 0);
} }
//--------------------------------------------------------------------------- //---------------------------------------------------------------------------
if (unlikely(!is_gc_usable(txn))) if (unlikely(!is_gc_usable(txn, mc, flags)))
goto no_gc; goto no_gc;
eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO | eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO |
@ -6691,7 +6675,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num,
if (/* Не коагулируем записи при подготовке резерва для обновления GC. if (/* Не коагулируем записи при подготовке резерва для обновления GC.
* Иначе попытка увеличить резерв может приводить к необходимости ещё * Иначе попытка увеличить резерв может приводить к необходимости ещё
* большего резерва из-за увеличения списка переработанных страниц. */ * большего резерва из-за увеличения списка переработанных страниц. */
flags < MDBX_ALLOC_COALESCE) { (flags & MDBX_ALLOC_RESERVE) == 0) {
if (txn->mt_dbs[FREE_DBI].md_branch_pages && if (txn->mt_dbs[FREE_DBI].md_branch_pages &&
re_len < env->me_maxgc_ov1page / 2) re_len < env->me_maxgc_ov1page / 2)
flags += MDBX_ALLOC_COALESCE; flags += MDBX_ALLOC_COALESCE;
@ -6777,7 +6761,6 @@ next_gc:;
&data, mp)) != MDBX_SUCCESS)) &data, mp)) != MDBX_SUCCESS))
goto fail; goto fail;
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
pgno_t *gc_pnl = (pgno_t *)data.iov_base; pgno_t *gc_pnl = (pgno_t *)data.iov_base;
if (unlikely(data.iov_len % sizeof(pgno_t) || if (unlikely(data.iov_len % sizeof(pgno_t) ||
data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
@ -6818,8 +6801,7 @@ next_gc:;
} }
if (unlikely(/* list is too long already */ re_len >= if (unlikely(/* list is too long already */ re_len >=
env->me_options.rp_augment_limit) && env->me_options.rp_augment_limit) &&
((/* not a slot-request from gc-update */ ((/* not a slot-request from gc-update */ num &&
(flags & MDBX_ALLOC_SLOT) == 0 &&
/* have enough unallocated space */ txn->mt_geo.upper >= /* have enough unallocated space */ txn->mt_geo.upper >=
txn->mt_next_pgno + num) || txn->mt_next_pgno + num) ||
gc_len + re_len >= MDBX_PGL_LIMIT)) { gc_len + re_len >= MDBX_PGL_LIMIT)) {
@ -6883,7 +6865,7 @@ next_gc:;
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
/* Done for a kick-reclaim mode, actually no page needed */ /* Done for a kick-reclaim mode, actually no page needed */
if (unlikely(flags & MDBX_ALLOC_SLOT)) { if (unlikely(num == 0)) {
eASSERT(env, ret.err == MDBX_SUCCESS); eASSERT(env, ret.err == MDBX_SUCCESS);
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id,
re_len); re_len);
@ -6901,6 +6883,7 @@ next_gc:;
scan: scan:
eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN);
eASSERT(env, num > 0);
if (re_len >= num) { if (re_len >= num) {
eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno &&
MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno);
@ -6977,8 +6960,7 @@ depleted_gc:
meta_prefer_steady(env, &txn->tw.troika).ptr_c); meta_prefer_steady(env, &txn->tw.troika).ptr_c);
goto retry_gc_refresh_oldest; goto retry_gc_refresh_oldest;
} }
if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || if ((autosync_threshold &&
(autosync_threshold &&
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
autosync_threshold) || autosync_threshold) ||
(autosync_period && (autosync_period &&
@ -6986,7 +6968,7 @@ depleted_gc:
atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
osal_monotime() - eoos_timestamp >= autosync_period) || osal_monotime() - eoos_timestamp >= autosync_period) ||
newnext >= txn->mt_geo.upper || newnext >= txn->mt_geo.upper ||
(newnext >= txn->mt_end_pgno && ((num == 0 || newnext >= txn->mt_end_pgno) &&
(autosync_threshold | autosync_period) == 0)) { (autosync_threshold | autosync_period) == 0)) {
/* make steady checkpoint. */ /* make steady checkpoint. */
#if MDBX_ENABLE_PROFGC #if MDBX_ENABLE_PROFGC
@ -7014,7 +6996,7 @@ depleted_gc:
/* Avoid kick lagging reader(s) if is enough unallocated space /* Avoid kick lagging reader(s) if is enough unallocated space
* at the end of database file. */ * at the end of database file. */
if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) { if (!(flags & MDBX_ALLOC_RESERVE) && newnext <= txn->mt_end_pgno) {
eASSERT(env, range == nullptr); eASSERT(env, range == nullptr);
goto done; goto done;
} }
@ -7028,7 +7010,7 @@ depleted_gc:
//--------------------------------------------------------------------------- //---------------------------------------------------------------------------
no_gc: no_gc:
if ((flags & MDBX_ALLOC_NEW) == 0) { if (flags & MDBX_ALLOC_RESERVE) {
ret.err = MDBX_NOTFOUND; ret.err = MDBX_NOTFOUND;
goto fail; goto fail;
} }
@ -7071,10 +7053,9 @@ no_gc:
done: done:
ret.err = MDBX_SUCCESS; ret.err = MDBX_SUCCESS;
if (likely((flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) == 0)) { if (likely((flags & MDBX_ALLOC_RESERVE) == 0)) {
ENSURE(env, pgno >= NUM_METAS); ENSURE(env, pgno >= NUM_METAS);
if (range) { if (range) {
eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0);
eASSERT(env, pgno == *range); eASSERT(env, pgno == *range);
eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS);
eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist));
@ -7091,7 +7072,6 @@ done:
eASSERT(env, pnl_check_allocated(txn->tw.relist, eASSERT(env, pnl_check_allocated(txn->tw.relist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
} else { } else {
eASSERT(env, flags & MDBX_ALLOC_NEW);
pgno = txn->mt_next_pgno; pgno = txn->mt_next_pgno;
txn->mt_next_pgno += (pgno_t)num; txn->mt_next_pgno += (pgno_t)num;
eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno);
@ -7135,8 +7115,9 @@ done:
int level; int level;
const char *what; const char *what;
if (flags & MDBX_ALLOC_RESERVE) { if (flags & MDBX_ALLOC_RESERVE) {
level = (flags & MDBX_ALLOC_BACKLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; level =
what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; (flags & MDBX_ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE;
what = num ? "reserve-pages" : "fetch-slot";
} else { } else {
txn->mt_flags |= MDBX_TXN_ERROR; txn->mt_flags |= MDBX_TXN_ERROR;
level = MDBX_LOG_ERROR; level = MDBX_LOG_ERROR;
@ -7151,7 +7132,7 @@ done:
} else { } else {
early_exit: early_exit:
DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num,
(flags & MDBX_ALLOC_SLOT) ? "SLOT" : "RESERVE", ret.err); num ? "RESERVE" : "SLOT", ret.err);
ret.page = NULL; ret.page = NULL;
} }
@ -7197,62 +7178,60 @@ __hot static pgr_t page_alloc(const MDBX_cursor *mc) {
return ret; return ret;
} }
if (likely(!(txn->mt_flags & MDBX_TXN_FROZEN_RE))) { MDBX_PNL pnl = txn->tw.relist;
MDBX_PNL pnl = txn->tw.relist; const size_t len = MDBX_PNL_GETSIZE(pnl);
const size_t len = MDBX_PNL_GETSIZE(pnl); if (likely(len > 0)) {
if (likely(len > 0)) { MDBX_env *const env = txn->mt_env;
MDBX_env *const env = txn->mt_env;
MDBX_PNL_SETSIZE(pnl, len - 1); MDBX_PNL_SETSIZE(pnl, len - 1);
#if MDBX_PNL_ASCENDING #if MDBX_PNL_ASCENDING
const pgno_t pgno = pnl[1]; const pgno_t pgno = pnl[1];
for (size_t i = 1; i < len; ++i) for (size_t i = 1; i < len; ++i)
pnl[i] = pnl[i + 1]; pnl[i] = pnl[i + 1];
#else #else
const pgno_t pgno = pnl[len]; const pgno_t pgno = pnl[len];
#endif #endif
#if MDBX_ENABLE_PROFGC #if MDBX_ENABLE_PROFGC
const uint64_t monotime_before = osal_monotime(); const uint64_t monotime_before = osal_monotime();
size_t majflt_before; size_t majflt_before;
const uint64_t cputime_before = osal_cputime(&majflt_before); const uint64_t cputime_before = osal_cputime(&majflt_before);
profgc_stat_t *const prof = profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI)
(mc->mc_dbi == FREE_DBI) ? &env->me_lck->mti_pgop_stat.gc_prof.self ? &env->me_lck->mti_pgop_stat.gc_prof.self
: &env->me_lck->mti_pgop_stat.gc_prof.work; : &env->me_lck->mti_pgop_stat.gc_prof.work;
#endif /* MDBX_ENABLE_PROFGC */ #endif /* MDBX_ENABLE_PROFGC */
pgr_t ret; pgr_t ret;
if (env->me_flags & MDBX_WRITEMAP) { if (env->me_flags & MDBX_WRITEMAP) {
ret.page = pgno2page(env, pgno); ret.page = pgno2page(env, pgno);
MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize);
} else { } else {
ret.page = page_malloc(txn, 1); ret.page = page_malloc(txn, 1);
if (unlikely(!ret.page)) { if (unlikely(!ret.page)) {
ret.err = MDBX_ENOMEM; ret.err = MDBX_ENOMEM;
goto bailout; goto bailout;
}
} }
VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize);
ret.page->mp_pgno = pgno;
ret.page->mp_leaf2_ksize = 0;
ret.page->mp_flags = 0;
tASSERT(txn, ret.page->mp_pgno >= NUM_METAS);
ret.err = page_dirty(txn, ret.page, 1);
bailout:
tASSERT(txn, pnl_check_allocated(txn->tw.relist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND));
#if MDBX_ENABLE_PROFGC
size_t majflt_after;
prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before;
prof->majflt += majflt_after - majflt_before;
prof->xtime_monotonic += osal_monotime() - monotime_before;
#endif /* MDBX_ENABLE_PROFGC */
return ret;
} }
VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize);
ret.page->mp_pgno = pgno;
ret.page->mp_leaf2_ksize = 0;
ret.page->mp_flags = 0;
tASSERT(txn, ret.page->mp_pgno >= NUM_METAS);
ret.err = page_dirty(txn, ret.page, 1);
bailout:
tASSERT(txn, pnl_check_allocated(txn->tw.relist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND));
#if MDBX_ENABLE_PROFGC
size_t majflt_after;
prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before;
prof->majflt += majflt_after - majflt_before;
prof->xtime_monotonic += osal_monotime() - monotime_before;
#endif /* MDBX_ENABLE_PROFGC */
return ret;
} }
return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); return page_alloc_slowpath(mc, 1, MDBX_ALLOC_DEFAULT);
} }
/* Copy the used portions of a page. */ /* Copy the used portions of a page. */
@ -9503,6 +9482,13 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) {
return err; return err;
} }
static int gcu_touch(gcu_context_t *ctx) {
ctx->cursor.mc_flags |= C_GCU;
int err = cursor_touch(&ctx->cursor);
ctx->cursor.mc_flags -= C_GCU;
return err;
}
/* Prepare a backlog of pages to modify GC itself, while reclaiming is /* Prepare a backlog of pages to modify GC itself, while reclaiming is
* prohibited. It should be enough to prevent search in page_alloc_slowpath() * prohibited. It should be enough to prevent search in page_alloc_slowpath()
* during a deleting, when GC tree is unbalanced. */ * during a deleting, when GC tree is unbalanced. */
@ -9537,9 +9523,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,
return err; return err;
} }
tASSERT(txn, txn->mt_flags & MDBX_TXN_UPDATE_GC); err = gcu_touch(ctx);
txn->mt_flags -= MDBX_TXN_UPDATE_GC;
err = cursor_touch(&ctx->cursor);
TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err);
if (unlikely(pages4retiredlist > 1) && if (unlikely(pages4retiredlist > 1) &&
@ -9549,9 +9533,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,
err = gcu_clean_stored_retired(txn, ctx); err = gcu_clean_stored_retired(txn, ctx);
if (unlikely(err != MDBX_SUCCESS)) if (unlikely(err != MDBX_SUCCESS))
return err; return err;
err = page_alloc_slowpath(&ctx->cursor, pages4retiredlist, err =
MDBX_ALLOC_GC | MDBX_ALLOC_RESERVE) page_alloc_slowpath(&ctx->cursor, pages4retiredlist, MDBX_ALLOC_RESERVE)
.err; .err;
TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err);
cASSERT(&ctx->cursor, cASSERT(&ctx->cursor,
gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS);
@ -9560,11 +9544,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,
while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist &&
err == MDBX_SUCCESS) err == MDBX_SUCCESS)
err = page_alloc_slowpath(&ctx->cursor, 0, err = page_alloc_slowpath(&ctx->cursor, 0,
MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | MDBX_ALLOC_UNIMPORTANT)
MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG)
.err; .err;
txn->mt_flags += MDBX_TXN_UPDATE_GC;
TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err);
return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS;
} }
@ -9593,7 +9575,6 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) {
MDBX_env *const env = txn->mt_env; MDBX_env *const env = txn->mt_env;
const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo";
(void)dbg_prefix_mode; (void)dbg_prefix_mode;
txn->mt_flags += MDBX_TXN_UPDATE_GC;
ctx->cursor.mc_next = txn->mt_cursors[FREE_DBI]; ctx->cursor.mc_next = txn->mt_cursors[FREE_DBI];
txn->mt_cursors[FREE_DBI] = &ctx->cursor; txn->mt_cursors[FREE_DBI] = &ctx->cursor;
@ -9741,10 +9722,7 @@ retry:
if (txn->tw.loose_count > 0) { if (txn->tw.loose_count > 0) {
TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode,
txn->tw.loose_count); txn->tw.loose_count);
rc = page_alloc_slowpath(&ctx->cursor, 0, rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err;
MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
MDBX_ALLOC_RESERVE)
.err;
if (rc == MDBX_SUCCESS) { if (rc == MDBX_SUCCESS) {
TRACE("%s: retry since gc-slot for %zu loose-pages available", TRACE("%s: retry since gc-slot for %zu loose-pages available",
dbg_prefix_mode, txn->tw.loose_count); dbg_prefix_mode, txn->tw.loose_count);
@ -9826,9 +9804,9 @@ retry:
if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) {
if (unlikely(!ctx->retired_stored)) { if (unlikely(!ctx->retired_stored)) {
/* Make sure last page of GC is touched and on retired-list */ /* Make sure last page of GC is touched and on retired-list */
txn->mt_flags -= MDBX_TXN_UPDATE_GC; rc = cursor_last(&ctx->cursor, nullptr, nullptr);
rc = page_search(&ctx->cursor, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); if (likely(rc != MDBX_SUCCESS))
txn->mt_flags += MDBX_TXN_UPDATE_GC; rc = gcu_touch(ctx);
if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND)
goto bailout; goto bailout;
} }
@ -9966,16 +9944,12 @@ retry:
left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) *
env->me_maxgc_ov1page && env->me_maxgc_ov1page &&
!ctx->dense) { !ctx->dense) {
/* Hужен свобожный для для сохранения списка страниц. */ /* Hужен свободный для для сохранения списка страниц. */
bool need_cleanup = false; bool need_cleanup = false;
txnid_t snap_oldest = 0; txnid_t snap_oldest = 0;
retry_rid: retry_rid:
txn->mt_flags -= MDBX_TXN_UPDATE_GC;
do { do {
rc = page_alloc_slowpath(&ctx->cursor, 0, rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err;
MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
MDBX_ALLOC_RESERVE)
.err;
snap_oldest = env->me_lck->mti_oldest_reader.weak; snap_oldest = env->me_lck->mti_oldest_reader.weak;
if (likely(rc == MDBX_SUCCESS)) { if (likely(rc == MDBX_SUCCESS)) {
TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode,
@ -9988,7 +9962,6 @@ retry:
left > left >
(MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) *
env->me_maxgc_ov1page); env->me_maxgc_ov1page);
txn->mt_flags += MDBX_TXN_UPDATE_GC;
if (likely(rc == MDBX_SUCCESS)) { if (likely(rc == MDBX_SUCCESS)) {
TRACE("%s: got enough from GC.", dbg_prefix_mode); TRACE("%s: got enough from GC.", dbg_prefix_mode);
@ -10006,7 +9979,7 @@ retry:
} else { } else {
tASSERT(txn, txn->tw.last_reclaimed == 0); tASSERT(txn, txn->tw.last_reclaimed == 0);
if (unlikely(txn_oldest_reader(txn) != snap_oldest)) if (unlikely(txn_oldest_reader(txn) != snap_oldest))
/* should retry page_alloc_slowpath(MDBX_ALLOC_GC) /* should retry page_alloc_slowpath()
* if the oldest reader changes since the last attempt */ * if the oldest reader changes since the last attempt */
goto retry_rid; goto retry_rid;
/* no reclaimable GC entries, /* no reclaimable GC entries,
@ -10289,7 +10262,6 @@ retry:
key.iov_len = sizeof(fill_gc_id); key.iov_len = sizeof(fill_gc_id);
tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2);
txn->mt_flags += MDBX_TXN_FROZEN_RE;
size_t chunk = data.iov_len / sizeof(pgno_t) - 1; size_t chunk = data.iov_len / sizeof(pgno_t) - 1;
if (unlikely(chunk > left)) { if (unlikely(chunk > left)) {
TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk,
@ -10297,14 +10269,11 @@ retry:
if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) ||
chunk - left > env->me_maxgc_ov1page) { chunk - left > env->me_maxgc_ov1page) {
data.iov_len = (left + 1) * sizeof(pgno_t); data.iov_len = (left + 1) * sizeof(pgno_t);
if (ctx->loop < 7)
txn->mt_flags &= ~MDBX_TXN_FROZEN_RE;
} }
chunk = left; chunk = left;
} }
rc = mdbx_cursor_put(&ctx->cursor, &key, &data, rc = mdbx_cursor_put(&ctx->cursor, &key, &data,
MDBX_CURRENT | MDBX_RESERVE); MDBX_CURRENT | MDBX_RESERVE);
txn->mt_flags &= ~MDBX_TXN_FROZEN_RE;
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
gcu_clean_reserved(env, data); gcu_clean_reserved(env, data);
@ -15079,7 +15048,8 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node,
if (!MDBX_DISABLE_VALIDATION) { if (!MDBX_DISABLE_VALIDATION) {
const MDBX_env *env = mc->mc_txn->mt_env; const MDBX_env *env = mc->mc_txn->mt_env;
const size_t dsize = data->iov_len; const size_t dsize = data->iov_len;
if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax)) if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax) &&
mc->mc_dbi != FREE_DBI)
poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
const unsigned npages = number_of_ovpages(env, dsize); const unsigned npages = number_of_ovpages(env, dsize);
if (unlikely(lp.page->mp_pages != npages)) { if (unlikely(lp.page->mp_pages != npages)) {
@ -15087,7 +15057,7 @@ static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node,
return bad_page(lp.page, return bad_page(lp.page,
"too less n-pages %u for bigdata-node (%zu bytes)", "too less n-pages %u for bigdata-node (%zu bytes)",
lp.page->mp_pages, dsize); lp.page->mp_pages, dsize);
else else if (mc->mc_dbi != FREE_DBI)
poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)",
lp.page->mp_pages, dsize); lp.page->mp_pages, dsize);
} }
@ -16183,7 +16153,6 @@ static int touch_dbi(MDBX_cursor *mc) {
*mc->mc_dbistate |= DBI_DIRTY; *mc->mc_dbistate |= DBI_DIRTY;
mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY;
if (mc->mc_dbi >= CORE_DBS) { if (mc->mc_dbi >= CORE_DBS) {
cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_UPDATE_GC) == 0);
/* Touch DB record of named DB */ /* Touch DB record of named DB */
MDBX_cursor_couple cx; MDBX_cursor_couple cx;
int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI);
@ -16596,9 +16565,9 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
/* Large/Overflow page overwrites need special handling */ /* Large/Overflow page overwrites need special handling */
if (unlikely(node_flags(node) & F_BIGDATA)) { if (unlikely(node_flags(node) & F_BIGDATA)) {
int dpages = (node_size(key, data) > env->me_leaf_nodemax) const size_t dpages = (node_size(key, data) > env->me_leaf_nodemax)
? number_of_ovpages(env, data->iov_len) ? number_of_ovpages(env, data->iov_len)
: 0; : 0;
const pgno_t pgno = node_largedata_pgno(node); const pgno_t pgno = node_largedata_pgno(node);
pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid);
@ -16607,13 +16576,13 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW);
/* Is the ov page from this txn (or a parent) and big enough? */ /* Is the ov page from this txn (or a parent) and big enough? */
int ovpages = lp.page->mp_pages; const size_t ovpages = lp.page->mp_pages;
if (!IS_FROZEN(mc->mc_txn, lp.page) && const size_t extra_threshold =
(unlikely(mc->mc_txn->mt_flags & MDBX_TXN_FROZEN_RE) (mc->mc_dbi == FREE_DBI)
? (ovpages >= dpages) ? 1
: (ovpages == : /* LY: add configurable threshold to keep reserve space */ 0;
/* LY: add configurable threshold to keep reserve space */ if (!IS_FROZEN(mc->mc_txn, lp.page) && ovpages >= dpages &&
dpages))) { ovpages <= dpages + extra_threshold) {
/* yes, overwrite it. */ /* yes, overwrite it. */
if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) { if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) {
if (IS_SPILLED(mc->mc_txn, lp.page)) { if (IS_SPILLED(mc->mc_txn, lp.page)) {
@ -17168,7 +17137,7 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) {
static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) {
pgr_t ret = likely(npages == 1) pgr_t ret = likely(npages == 1)
? page_alloc(mc) ? page_alloc(mc)
: page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); : page_alloc_slowpath(mc, npages, MDBX_ALLOC_DEFAULT);
if (unlikely(ret.err != MDBX_SUCCESS)) if (unlikely(ret.err != MDBX_SUCCESS))
return ret; return ret;
@ -17279,7 +17248,6 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx,
key ? key->iov_len : 0, DKEY_DEBUG(key)); key ? key->iov_len : 0, DKEY_DEBUG(key));
cASSERT(mc, key != NULL && data != NULL); cASSERT(mc, key != NULL && data != NULL);
cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF); cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF);
cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data));
MDBX_page *largepage = NULL; MDBX_page *largepage = NULL;
size_t node_bytes; size_t node_bytes;
@ -17288,6 +17256,7 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx,
STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); STATIC_ASSERT(sizeof(pgno_t) % 2 == 0);
node_bytes = node_bytes =
node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
cASSERT(mc, page_room(mp) >= node_bytes);
} else if (unlikely(node_size(key, data) > } else if (unlikely(node_size(key, data) >
mc->mc_txn->mt_env->me_leaf_nodemax)) { mc->mc_txn->mt_env->me_leaf_nodemax)) {
/* Put data on large/overflow page. */ /* Put data on large/overflow page. */
@ -17301,6 +17270,7 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx,
flags); flags);
return MDBX_PROBLEM; return MDBX_PROBLEM;
} }
cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data));
const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len);
const pgr_t npr = page_new_large(mc, ovpages); const pgr_t npr = page_new_large(mc, ovpages);
if (unlikely(npr.err != MDBX_SUCCESS)) if (unlikely(npr.err != MDBX_SUCCESS))
@ -17312,10 +17282,12 @@ __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx,
flags |= F_BIGDATA; flags |= F_BIGDATA;
node_bytes = node_bytes =
node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data));
} else { } else {
cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data));
node_bytes = node_size(key, data) + sizeof(indx_t); node_bytes = node_size(key, data) + sizeof(indx_t);
cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data));
} }
cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data));
/* Move higher pointers up one slot. */ /* Move higher pointers up one slot. */
const size_t nkeys = page_numkeys(mp); const size_t nkeys = page_numkeys(mp);
@ -19056,7 +19028,8 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) {
"big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n",
dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max);
if (unlikely(node_size_len(node_ks(node), dsize) <= if (unlikely(node_size_len(node_ks(node), dsize) <=
mc->mc_txn->mt_env->me_leaf_nodemax)) mc->mc_txn->mt_env->me_leaf_nodemax) &&
mc->mc_dbi != FREE_DBI)
poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
if ((mc->mc_checking & CC_RETIRING) == 0) { if ((mc->mc_checking & CC_RETIRING) == 0) {
@ -19071,7 +19044,7 @@ __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) {
rc = bad_page(lp.page, rc = bad_page(lp.page,
"too less n-pages %u for bigdata-node (%zu bytes)", "too less n-pages %u for bigdata-node (%zu bytes)",
lp.page->mp_pages, dsize); lp.page->mp_pages, dsize);
else else if (mc->mc_dbi != FREE_DBI)
poor_page(lp.page, poor_page(lp.page,
"extra n-pages %u for bigdata-node (%zu bytes)", "extra n-pages %u for bigdata-node (%zu bytes)",
lp.page->mp_pages, dsize); lp.page->mp_pages, dsize);

View File

@ -1000,13 +1000,9 @@ struct MDBX_txn {
/* Additional flag for sync_locked() */ /* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not altered */
#define TXN_FLAGS \ #define TXN_FLAGS \
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \ MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
MDBX_TXN_FROZEN_RE)
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@ -1147,6 +1143,9 @@ struct MDBX_cursor {
#define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */ #define C_DEL 0x08 /* last op was a cursor_del */
#define C_UNTRACK 0x10 /* Un-track cursor when closing */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */
#define C_GCU \
0x20 /* Происходит подготовка к обновлению GC, поэтому \
* можно брать страницы из GC даже для FREE_DBI */
uint8_t mc_flags; uint8_t mc_flags;
/* Cursor checking flags. */ /* Cursor checking flags. */