mdbx: новый код обновления GC.

This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2025-04-26 00:15:41 +03:00
parent 011c3072da
commit 2b36fd5974
No known key found for this signature in database
GPG Key ID: 518BD10B927E8686
24 changed files with 1493 additions and 905 deletions

6
mdbx.h
View File

@ -2775,10 +2775,10 @@ typedef struct MDBX_stat MDBX_stat;
* Legacy mdbx_env_stat() correspond to calling \ref mdbx_env_stat_ex() with the
* null `txn` argument.
*
* \param [in] env An environment handle returned by \ref mdbx_env_create()
* \param [in] txn A transaction handle returned by \ref mdbx_txn_begin()
* \param [in] env An environment handle returned by \ref mdbx_env_create().
* \param [in] txn A transaction handle returned by \ref mdbx_txn_begin().
* \param [out] stat The address of an \ref MDBX_stat structure where
* the statistics will be copied
* the statistics will be copied.
* \param [in] bytes The size of \ref MDBX_stat.
*
* \returns A non-zero error value on failure and 0 on success. */

View File

@ -955,7 +955,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t si
env->basal_txn->wr.troika = meta_tap(env);
eASSERT(env, !env->txn && !env->basal_txn->nested);
env->basal_txn->txnid = env->basal_txn->wr.troika.txnid[env->basal_txn->wr.troika.recent];
txn_snapshot_oldest(env->basal_txn);
txn_gc_detent(env->basal_txn);
}
/* get untouched params from current TXN or DB */

View File

@ -513,23 +513,25 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
info->txn_reader_lag = INT64_MAX;
lck_t *const lck = env->lck_mmap.lck;
if (scan_rlt && lck) {
txnid_t oldest_snapshot = txn->txnid;
txnid_t oldest_reading = txn->txnid;
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
if (snap_nreaders) {
oldest_snapshot = txn_snapshot_oldest(txn);
if (oldest_snapshot == txn->txnid - 1) {
/* check if there is at least one reader */
bool exists = false;
txn_gc_detent(txn);
oldest_reading = txn->env->gc.detent;
if (oldest_reading == txn->wr.troika.txnid[txn->wr.troika.recent]) {
/* Если самый старый используемый снимок является предыдущим, т. е. непосредственно предшествующим текущей
* транзакции, то просматриваем таблицу читателей чтобы выяснить действительно ли снимок используется
* читателями. */
oldest_reading = txn->txnid;
for (size_t i = 0; i < snap_nreaders; ++i) {
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->txnid > safe64_read(&lck->rdt[i].txnid)) {
exists = true;
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->env->gc.detent == safe64_read(&lck->rdt[i].txnid)) {
oldest_reading = txn->env->gc.detent;
break;
}
}
oldest_snapshot += !exists;
}
}
info->txn_reader_lag = txn->txnid - oldest_snapshot;
info->txn_reader_lag = txn->txnid - oldest_reading;
}
}

View File

@ -24,12 +24,11 @@ static size_t audit_db_used(const tree_t *db) {
return db ? (size_t)db->branch_pages + (size_t)db->leaf_pages + (size_t)db->large_pages : 0;
}
__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) {
__cold static int audit_ex_locked(MDBX_txn *txn, const size_t retired_stored, const bool dont_filter_gc) {
const MDBX_env *const env = txn->env;
size_t pending = 0;
if ((txn->flags & MDBX_TXN_RDONLY) == 0)
pending = txn->wr.loose_count + MDBX_PNL_GETSIZE(txn->wr.repnl) +
(MDBX_PNL_GETSIZE(txn->wr.retired_pages) - retired_stored);
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
const size_t pending = txn->wr.loose_count + MDBX_PNL_GETSIZE(txn->wr.repnl) +
(MDBX_PNL_GETSIZE(txn->wr.retired_pages) - retired_stored);
cursor_couple_t cx;
int rc = cursor_init(&cx.outer, txn, FREE_DBI);
@ -40,17 +39,16 @@ __cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, bool don
MDBX_val key, data;
rc = outer_first(&cx.outer, &key, &data);
while (rc == MDBX_SUCCESS) {
if (!dont_filter_gc) {
if (unlikely(key.iov_len != sizeof(txnid_t))) {
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len);
return MDBX_CORRUPTED;
}
txnid_t id = unaligned_peek_u64(4, key.iov_base);
if (txn->wr.gc.retxl ? txl_contain(txn->wr.gc.retxl, id) : (id <= txn->wr.gc.last_reclaimed))
goto skip;
if (unlikely(key.iov_len != sizeof(txnid_t))) {
ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len);
return MDBX_CORRUPTED;
}
gc += *(pgno_t *)data.iov_base;
skip:
const txnid_t id = unaligned_peek_u64(4, key.iov_base);
const size_t len = *(pgno_t *)data.iov_base;
const bool acc = dont_filter_gc || !gc_is_reclaimed(txn, id);
TRACE("%s id %" PRIaTXN " len %zu", acc ? "acc" : "skip", id, len);
if (acc)
gc += len;
rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT);
}
tASSERT(txn, rc == MDBX_NOTFOUND);

View File

@ -1780,8 +1780,7 @@ __hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cur
}
int cmp = mc->clc->k.cmp(&aligned_key, &nodekey);
if (unlikely(cmp == 0)) {
/* Probably happens rarely, but first node on the page
* was the one we wanted. */
/* Probably happens rarely, but first node on the page was the one we wanted. */
mc->ki[mc->top] = 0;
ret.exact = true;
goto got_node;

View File

@ -53,7 +53,7 @@ static inline dpl_t *dpl_sort(const MDBX_txn *txn) {
return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn);
}
MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno);
MDBX_MAYBE_UNUSED MDBX_INTERNAL const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno);
@ -68,7 +68,7 @@ MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t dpl_endpgno(const dpl_t *dl, siz
return dpl_npages(dl, i) + dl->items[i].pgno;
}
static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
MDBX_NOTHROW_PURE_FUNCTION static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, size_t npages) {
tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);

View File

@ -1061,16 +1061,17 @@ int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, troika
#endif /* MADV_DONTNEED || POSIX_MADV_DONTNEED */
/* LY: check conditions to shrink datafile */
const pgno_t backlog_gap = 3 + pending->trees.gc.height * 3;
const pgno_t stockpile_gap = 3 + pending->trees.gc.height * 3;
pgno_t shrink_step = 0;
if (pending->geometry.shrink_pv && pending->geometry.now - pending->geometry.first_unallocated >
(shrink_step = pv2pages(pending->geometry.shrink_pv)) + backlog_gap) {
if (pending->geometry.now > largest_pgno && pending->geometry.now - largest_pgno > shrink_step + backlog_gap) {
(shrink_step = pv2pages(pending->geometry.shrink_pv)) + stockpile_gap) {
if (pending->geometry.now > largest_pgno &&
pending->geometry.now - largest_pgno > shrink_step + stockpile_gap) {
const pgno_t aligner =
pending->geometry.grow_pv ? /* grow_step */ pv2pages(pending->geometry.grow_pv) : shrink_step;
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
const pgno_t with_stockpile_gap = largest_pgno + stockpile_gap;
const pgno_t aligned =
pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - with_backlog_gap % aligner);
pgno_align2os_pgno(env, (size_t)with_stockpile_gap + aligner - with_stockpile_gap % aligner);
const pgno_t bottom = (aligned > pending->geometry.lower) ? aligned : pending->geometry.lower;
if (pending->geometry.now > bottom) {
if (TROIKA_HAVE_STEADY(troika))

View File

@ -164,7 +164,7 @@ retry:;
}
eASSERT(env, head.txnid == recent_committed_txnid(env));
env->basal_txn->txnid = head.txnid;
txn_snapshot_oldest(env->basal_txn);
txn_gc_detent(env->basal_txn);
flags |= txn_shrink_allowed;
}
@ -524,7 +524,7 @@ __cold int env_close(MDBX_env *env, bool resurrect_after_fork) {
env->defer_free = nullptr;
#endif /* MDBX_ENABLE_DBI_LOCKFREE */
if (!(env->flags & MDBX_RDONLY))
if ((env->flags & MDBX_RDONLY) == 0)
osal_ioring_destroy(&env->ioring);
env->lck = nullptr;

View File

@ -30,8 +30,10 @@ typedef struct iov_ctx iov_ctx_t;
/* Machine word width selection: 64 when uintptr_t or unsigned long is wider
 * than 32 bits (or when building for Win64), otherwise 32.
 * MDBX_WORDBITS_LN2 is the base-2 logarithm of MDBX_WORDBITS. */
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64)
#define MDBX_WORDBITS 64
#define MDBX_WORDBITS_LN2 6
#else
#define MDBX_WORDBITS 32
#define MDBX_WORDBITS_LN2 5
#endif /* MDBX_WORDBITS */
#include "options.h"

View File

@ -570,14 +570,11 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t s
/*----------------------------------------------------------------------------*/
#define ALLOC_COALESCE 4 /* internal state */
#define ALLOC_SHOULD_SCAN 8 /* internal state */
#define ALLOC_LIFO 16 /* internal state */
static inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, const uint8_t flags) {
static inline bool is_reclaimable(MDBX_txn *txn, const MDBX_cursor *mc, const uint8_t flags) {
/* If txn is updating the GC, then the retired-list cannot play catch-up with
* itself by growing while trying to save it. */
if (mc->tree == &txn->dbs[FREE_DBI] && !(flags & ALLOC_RESERVE) && !(mc->flags & z_gcu_preparation))
STATIC_ASSERT(ALLOC_RESERVE == z_gcu_preparation);
if (mc->tree == &txn->dbs[FREE_DBI] && !((flags | mc->flags) & z_gcu_preparation))
return false;
/* avoid search inside empty tree and while tree is updating,
@ -590,8 +587,6 @@ static inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, const uint
return true;
}
/* Reports whether the GC record with the given id is listed in the
 * transaction's wr.gc.retxl list. */
static inline bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) {
  const bool listed = txl_contain(txn->wr.gc.retxl, id);
  return listed;
}
__hot static pgno_t repnl_get_single(MDBX_txn *txn) {
const size_t len = MDBX_PNL_GETSIZE(txn->wr.repnl);
assert(len > 0);
@ -721,6 +716,10 @@ __hot static pgno_t repnl_get_sequence(MDBX_txn *txn, const size_t num, uint8_t
return 0;
}
/* Tells whether txn->wr.repnl can satisfy a request for `num` pages.
 *
 * For num <= 1 it is enough that the list is non-empty. For num > 1 the
 * answer comes from repnl_get_sequence() called with ALLOC_RESERVE, i.e.
 * whether it returns a non-zero page number.
 * NOTE(review): presumably ALLOC_RESERVE makes repnl_get_sequence() only probe
 * for the span without extracting it from the list - confirm against its body. */
bool gc_repnl_has_span(MDBX_txn *txn, const size_t num) {
  if (num > 1)
    return repnl_get_sequence(txn, num, ALLOC_RESERVE) != 0;
  return !MDBX_PNL_IS_EMPTY(txn->wr.repnl);
}
static inline pgr_t page_alloc_finalize(MDBX_env *const env, MDBX_txn *const txn, const MDBX_cursor *const mc,
const pgno_t pgno, const size_t num) {
#if MDBX_ENABLE_PROFGC
@ -842,6 +841,13 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
prof->spe_counter += 1;
#endif /* MDBX_ENABLE_PROFGC */
/* Если взведен флажок ALLOC_RESERVE, то требуется только обеспечение соответствующего резерва в txn->wr.repnl
* и/или txn->wr.gc.reclaimed, но без выделения и возврата страницы. При этом возможны три варианта вызова:
* 1. num == 0 требуется слот для возврата в GC остатков ранее переработанных/извлеченных страниц,
* при этом нет смысла перерабатывать длинные записи, так как тогда дефицит свободных id/слотов не уменьшится;
* 2. num == 1 требуется увеличение резерва перед обновлением GC;
* 3. num > 1 требуется последовательность страниц для сохранения retired-страниц
* при выключенном MDBX_ENABLE_BIGFOOT. */
eASSERT(env, num > 0 || (flags & ALLOC_RESERVE));
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
@ -866,13 +872,12 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
goto done;
}
} else {
eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->wr.repnl) == 0);
eASSERT(env, !(flags & ALLOC_RESERVE) || num == 0);
eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->wr.repnl) == 0 || (flags & ALLOC_RESERVE));
}
//---------------------------------------------------------------------------
if (unlikely(!is_gc_usable(txn, mc, flags))) {
if (unlikely(!is_reclaimable(txn, mc, flags))) {
eASSERT(env, (txn->flags & txn_gc_drained) || num > 1);
goto no_gc;
}
@ -880,21 +885,18 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
eASSERT(env, (flags & (ALLOC_COALESCE | ALLOC_LIFO | ALLOC_SHOULD_SCAN)) == 0);
flags += (env->flags & MDBX_LIFORECLAIM) ? ALLOC_LIFO : 0;
if (/* Не коагулируем записи при подготовке резерва для обновления GC.
* Иначе попытка увеличить резерв может приводить к необходимости ещё
* большего резерва из-за увеличения списка переработанных страниц. */
(flags & ALLOC_RESERVE) == 0) {
if (txn->dbs[FREE_DBI].branch_pages && MDBX_PNL_GETSIZE(txn->wr.repnl) < env->maxgc_large1page / 2)
flags += ALLOC_COALESCE;
}
/* Не коагулируем записи в случае запроса слота для возврата страниц в GC. Иначе попытка увеличить резерв
* может приводить к необходимости ещё большего резерва из-за увеличения списка переработанных страниц. */
if (num > 0 && txn->dbs[FREE_DBI].branch_pages && MDBX_PNL_GETSIZE(txn->wr.repnl) < env->maxgc_large1page / 2)
flags += ALLOC_COALESCE;
MDBX_cursor *const gc = ptr_disp(env->basal_txn, sizeof(MDBX_txn));
MDBX_cursor *const gc = txn_gc_cursor(txn);
eASSERT(env, mc != gc && gc->next == gc);
gc->txn = txn;
gc->dbi_state = txn->dbi_state;
gc->top_and_flags = z_fresh_mark;
txn->wr.prefault_write_activated = env->options.prefault_write;
txn->wr.prefault_write_activated = !env->incore && env->options.prefault_write;
if (txn->wr.prefault_write_activated) {
/* Проверка посредством minicore() существенно снижает затраты, но в
* простейших случаях (тривиальный бенчмарк) интегральная производительность
@ -911,45 +913,38 @@ pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags)
txn->wr.prefault_write_activated = false;
}
retry_gc_refresh_oldest:;
txnid_t oldest = txn_snapshot_oldest(txn);
retry_gc_have_oldest:
if (unlikely(oldest >= txn->txnid)) {
ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN " for current-txnid %" PRIaTXN, oldest, txn->txnid);
retry_gc_refresh_detent:
txn_gc_detent(txn);
retry_gc_have_detent:
if (unlikely(txn->env->gc.detent >= txn->txnid)) {
FATAL("unexpected/invalid gc-detent %" PRIaTXN " for current-txnid %" PRIaTXN, txn->env->gc.detent, txn->txnid);
ret.err = MDBX_PROBLEM;
goto fail;
}
const txnid_t detent = oldest + 1;
txnid_t id = 0;
MDBX_cursor_op op = MDBX_FIRST;
if (flags & ALLOC_LIFO) {
if (!txn->wr.gc.retxl) {
txn->wr.gc.retxl = txl_alloc();
if (unlikely(!txn->wr.gc.retxl)) {
ret.err = MDBX_ENOMEM;
goto fail;
}
}
/* Begin lookup backward from oldest reader */
id = detent - 1;
id = txn->env->gc.detent;
op = MDBX_SET_RANGE;
} else if (txn->wr.gc.last_reclaimed) {
} else {
/* Continue lookup forward from last-reclaimed */
id = txn->wr.gc.last_reclaimed + 1;
if (id >= detent)
goto depleted_gc;
op = MDBX_SET_RANGE;
id = rkl_highest(&txn->wr.gc.reclaimed);
if (id) {
id += 1;
op = MDBX_SET_RANGE;
if (id >= txn->env->gc.detent)
goto depleted_gc;
}
}
next_gc:;
MDBX_val key;
key.iov_base = &id;
key.iov_len = sizeof(id);
next_gc:
#if MDBX_ENABLE_PROFGC
prof->rsteps += 1;
prof->rsteps += 1
#endif /* MDBX_ENABLE_PROFGC */
;
MDBX_val key = {.iov_base = &id, .iov_len = sizeof(id)};
/* Seek first/next GC record */
ret.err = cursor_ops(gc, &key, nullptr, op);
@ -967,15 +962,18 @@ next_gc:;
ret.err = MDBX_CORRUPTED;
goto fail;
}
id = unaligned_peek_u64(4, key.iov_base);
if (flags & ALLOC_LIFO) {
op = MDBX_PREV;
if (id >= detent || is_already_reclaimed(txn, id))
if (id >= txn->env->gc.detent || gc_is_reclaimed(txn, id))
goto next_gc;
} else {
op = MDBX_NEXT;
if (unlikely(id >= detent))
if (unlikely(id >= txn->env->gc.detent))
goto depleted_gc;
op = MDBX_NEXT;
if (gc_is_reclaimed(txn, id))
goto next_gc;
}
txn->flags &= ~txn_gc_drained;
@ -996,12 +994,23 @@ next_gc:;
const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl);
TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl));
if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) >= env->maxgc_large1page)) {
/* Don't try to coalesce too much. */
if (unlikely(!num)) {
/* TODO: Проверка критериев пункта 2 сформулированного в gc_provide_slots().
* Сейчас тут сильно упрощенная и не совсем верная проверка, так как пока недоступна информация о кол-ве имеющихся
* слотов и их дефиците для возврата wr.repl. */
if (gc_len > env->maxgc_large1page / 4 * 3
/* если запись достаточно длинная, то переработка слота не особо увеличит место для возврата wr.repl, и т.п. */
&& MDBX_PNL_GETSIZE(txn->wr.repnl) + gc_len > env->maxgc_large1page /* не помещается в хвост */) {
DEBUG("avoid reclaiming %" PRIaTXN " slot, since it is too long (%zu)", id, gc_len);
ret.err = MDBX_NOTFOUND;
goto reserve_done;
}
}
if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) /* Don't try to coalesce too much. */ >=
env->maxgc_large1page)) {
if (flags & ALLOC_SHOULD_SCAN) {
eASSERT(env, flags & ALLOC_COALESCE);
eASSERT(env, !(flags & ALLOC_RESERVE));
eASSERT(env, num > 0);
eASSERT(env, (flags & ALLOC_COALESCE) /* && !(flags & ALLOC_RESERVE) */ && num > 0);
#if MDBX_ENABLE_PROFGC
env->lck->pgops.gc_prof.coalescences += 1;
#endif /* MDBX_ENABLE_PROFGC */
@ -1010,25 +1019,25 @@ next_gc:;
eASSERT(env, MDBX_PNL_LAST(txn->wr.repnl) < txn->geo.first_unallocated &&
MDBX_PNL_FIRST(txn->wr.repnl) < txn->geo.first_unallocated);
if (likely(num == 1)) {
pgno = repnl_get_single(txn);
pgno = (flags & ALLOC_RESERVE) ? P_INVALID : repnl_get_single(txn);
goto done;
}
pgno = repnl_get_sequence(txn, num, flags);
if (likely(pgno))
goto done;
}
flags -= ALLOC_COALESCE | ALLOC_SHOULD_SCAN;
}
flags &= ~(ALLOC_COALESCE | ALLOC_SHOULD_SCAN);
if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(txn->wr.repnl) >= env->options.rp_augment_limit) &&
((/* not a slot-request from gc-update */ num &&
/* have enough unallocated space */ txn->geo.upper >= txn->geo.first_unallocated + num &&
monotime_since_cached(monotime_begin, &now_cache) + txn->wr.gc.time_acc >= env->options.gc_time_limit) ||
monotime_since_cached(monotime_begin, &now_cache) + txn->wr.gc.spent >= env->options.gc_time_limit) ||
gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) >= PAGELIST_LIMIT)) {
/* Stop reclaiming to avoid large/overflow the page list. This is a rare
* case while search for a continuously multi-page region in a
* large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */
* case while search for a continuously multi-page region in a large database,
* see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */
NOTICE("stop reclaiming %s: %zu (current) + %zu "
"(chunk) -> %zu, rp_augment_limit %u",
"(chunk) >= %zu, rp_augment_limit %u",
likely(gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl) < PAGELIST_LIMIT) ? "since rp_augment_limit was reached"
: "to avoid PNL overflow",
MDBX_PNL_GETSIZE(txn->wr.repnl), gc_len, gc_len + MDBX_PNL_GETSIZE(txn->wr.repnl),
@ -1038,12 +1047,17 @@ next_gc:;
}
/* Remember ID of readed GC record */
txn->wr.gc.last_reclaimed = id;
if (flags & ALLOC_LIFO) {
ret.err = txl_append(&txn->wr.gc.retxl, id);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
}
ret.err = rkl_push(&txn->wr.gc.reclaimed, id,
false /* Вместо false, тут можно передавать/использовать (flags & ALLOC_LIFO) == 0, тогда
* дыры/пропуски в идентификаторах GC будут образовывать непрерывные интервалы в wr.gc.reclaimed,
* что обеспечит больше свободных идентификаторов/слотов для возврата страниц. Однако, это
* также приведёт к пустым попыткам удаления отсутствующих записей в gc_clear_reclaimed(),
* а далее к перекладыванию этих сплошных интервалов поэлементно в ready4reuse.
* Поэтому смысла в этом решительно нет. Следует либо формировать сплошные интервалы при
* работе gc_clear_reclaimed(), особенно в FIFO-режиме, либо искать их только в gc_provide_ids() */);
TRACE("%" PRIaTXN " len %zu pushed to txn-rkl, err %d", id, gc_len, ret.err);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
/* Append PNL from GC record to wr.repnl */
ret.err = pnl_need(&txn->wr.repnl, gc_len);
@ -1087,22 +1101,25 @@ next_gc:;
}
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
/* Done for a kick-reclaim mode, actually no page needed */
if (unlikely(num == 0)) {
eASSERT(env, ret.err == MDBX_SUCCESS);
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
goto early_exit;
}
/* TODO: delete reclaimed records */
/* TODO: удаление загруженных из GC записей */
eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT);
if (flags & ALLOC_COALESCE) {
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
goto next_gc;
if (MDBX_PNL_GETSIZE(txn->wr.repnl) < env->maxgc_large1page / 2) {
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
goto next_gc;
}
flags -= ALLOC_COALESCE;
}
scan:
if ((flags & ALLOC_RESERVE) && num < 2) {
/* Если был нужен только slot/id для gc_reclaim_slot() или gc_reserve4stockpile() */
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "reserve-done", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
ret.err = MDBX_SUCCESS;
goto reserve_done;
}
eASSERT(env, flags & ALLOC_SHOULD_SCAN);
eASSERT(env, num > 0);
if (MDBX_PNL_GETSIZE(txn->wr.repnl) >= num) {
@ -1118,17 +1135,16 @@ scan:
goto done;
}
flags -= ALLOC_SHOULD_SCAN;
if (ret.err == MDBX_SUCCESS) {
if ((txn->flags & txn_gc_drained) == 0) {
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
goto next_gc;
}
depleted_gc:
TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, MDBX_PNL_GETSIZE(txn->wr.repnl));
ret.err = MDBX_NOTFOUND;
txn->flags |= txn_gc_drained;
if (flags & ALLOC_SHOULD_SCAN)
goto scan;
txn->flags |= txn_gc_drained;
//-------------------------------------------------------------------------
@ -1145,9 +1161,9 @@ depleted_gc:
/* Does reclaiming stopped at the last steady point? */
const meta_ptr_t recent = meta_recent(env, &txn->wr.troika);
const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->wr.troika);
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && detent == prefer_steady.txnid + 1) {
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN "-%s, detent %" PRIaTXN, recent.txnid,
durable_caption(recent.ptr_c), prefer_steady.txnid, durable_caption(prefer_steady.ptr_c), detent);
if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && txn->env->gc.detent == prefer_steady.txnid) {
DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN "-%s", recent.txnid, durable_caption(recent.ptr_c),
prefer_steady.txnid, durable_caption(prefer_steady.ptr_c));
const pgno_t autosync_threshold = atomic_load32(&env->lck->autosync_threshold, mo_Relaxed);
const uint64_t autosync_period = atomic_load64(&env->lck->autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
@ -1166,12 +1182,12 @@ depleted_gc:
#if MDBX_ENABLE_PROFGC
env->lck->pgops.gc_prof.wipes += 1;
#endif /* MDBX_ENABLE_PROFGC */
ret.err = meta_wipe_steady(env, detent);
ret.err = meta_wipe_steady(env, txn->env->gc.detent);
DEBUG("gc-wipe-steady, rc %d", ret.err);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->wr.troika).ptr_c);
goto retry_gc_refresh_oldest;
goto retry_gc_refresh_detent;
}
if ((autosync_threshold && atomic_load64(&env->lck->unsynced_pages, mo_Relaxed) >= autosync_threshold) ||
(autosync_period && (eoos_timestamp = atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) &&
@ -1189,15 +1205,12 @@ depleted_gc:
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
eASSERT(env, prefer_steady.ptr_c != meta_prefer_steady(env, &txn->wr.troika).ptr_c);
goto retry_gc_refresh_oldest;
goto retry_gc_refresh_detent;
}
}
if (unlikely(true == atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease))) {
oldest = txn_snapshot_oldest(txn);
if (oldest >= detent)
goto retry_gc_have_oldest;
}
if (unlikely(true == atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease)) && txn_gc_detent(txn))
goto retry_gc_have_detent;
/* Avoid kick lagging reader(s) if is enough unallocated space
* at the end of database file. */
@ -1206,11 +1219,8 @@ depleted_gc:
goto done;
}
if (oldest < txn->txnid - xMDBX_TXNID_STEP) {
oldest = mvcc_kick_laggards(env, oldest);
if (oldest >= detent)
goto retry_gc_have_oldest;
}
if (txn->txnid - txn->env->gc.detent > xMDBX_TXNID_STEP && mvcc_kick_laggards(env, txn->env->gc.detent))
goto retry_gc_refresh_detent;
//---------------------------------------------------------------------------
@ -1277,30 +1287,40 @@ done:
eASSERT(env, ret.err != MDBX_SUCCESS);
eASSERT(env, pnl_check_allocated(txn->wr.repnl, txn->geo.first_unallocated - MDBX_ENABLE_REFUND));
int level;
const char *what;
if (flags & ALLOC_RESERVE) {
level = (flags & ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE;
what = num ? "reserve-pages" : "fetch-slot";
} else {
if (flags & ALLOC_UNIMPORTANT)
level = MDBX_LOG_DEBUG;
else if (flags & ALLOC_RESERVE)
level = MDBX_LOG_NOTICE;
else {
txn->flags |= MDBX_TXN_ERROR;
level = MDBX_LOG_ERROR;
what = "pages";
}
if (LOG_ENABLED(level))
debug_log(level, __func__, __LINE__,
"unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags "
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
"branch %zu, leaf %zu, large %zu, entries %zu\n",
num, what, flags, ret.err, txn->flags, MDBX_PNL_GETSIZE(txn->wr.repnl), txn->wr.loose_count,
txn->dbs[FREE_DBI].height, (size_t)txn->dbs[FREE_DBI].branch_pages,
(size_t)txn->dbs[FREE_DBI].leaf_pages, (size_t)txn->dbs[FREE_DBI].large_pages,
(size_t)txn->dbs[FREE_DBI].items);
if (LOG_ENABLED(level)) {
if (num)
debug_log(level, __func__, __LINE__,
"unable %s %zu, alloc-flags 0x%x, err %d, txn-flags "
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
"branch %zu, leaf %zu, large %zu, entries %zu\n",
(flags & ALLOC_RESERVE) ? "reserve" : "alloc", num, flags, ret.err, txn->flags,
MDBX_PNL_GETSIZE(txn->wr.repnl), txn->wr.loose_count, txn->dbs[FREE_DBI].height,
(size_t)txn->dbs[FREE_DBI].branch_pages, (size_t)txn->dbs[FREE_DBI].leaf_pages,
(size_t)txn->dbs[FREE_DBI].large_pages, (size_t)txn->dbs[FREE_DBI].items);
else
debug_log(level, __func__, __LINE__,
"unable fetch-slot, alloc-flags 0x%x, err %d, txn-flags "
"0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
"branch %zu, leaf %zu, large %zu, entries %zu\n",
flags, ret.err, txn->flags, MDBX_PNL_GETSIZE(txn->wr.repnl), txn->wr.loose_count,
txn->dbs[FREE_DBI].height, (size_t)txn->dbs[FREE_DBI].branch_pages,
(size_t)txn->dbs[FREE_DBI].leaf_pages, (size_t)txn->dbs[FREE_DBI].large_pages,
(size_t)txn->dbs[FREE_DBI].items);
}
ret.page = nullptr;
}
if (num > 1)
txn->wr.gc.time_acc += monotime_since_cached(monotime_begin, &now_cache);
txn->wr.gc.spent += monotime_since_cached(monotime_begin, &now_cache);
} else {
early_exit:
reserve_done:
DEBUG("return nullptr for %zu pages for ALLOC_%s, rc %d", num, num ? "RESERVE" : "SLOT", ret.err);
ret.page = nullptr;
}

File diff suppressed because it is too large Load Diff

View File

@ -5,14 +5,37 @@
#include "essentials.h"
/* Histogram of the fragment-slicing solution used when there is a shortage of identifiers/slots. */
typedef struct gc_dense_histogram {
/* The array size also defines the maximum length of the sequences
 * the distribution problem is solved for.
 *
 * Using long sequences is counterproductive, since such sequences will
 * create/reproduce/repeat the same difficulties during subsequent reclaiming. However,
 * in rare situations this may be the only way out. */
unsigned end;
pgno_t array[31];
} gc_dense_histogram_t;
typedef struct gc_update_context {
unsigned loop;
pgno_t prev_first_unallocated;
unsigned goodchunk;
bool dense;
size_t reserve_adj;
pgno_t prev_first_unallocated;
size_t retired_stored;
size_t amount, reserved, cleaned_slot, reused_slot, fill_idx;
txnid_t cleaned_id, rid;
size_t return_reserved_lo, return_reserved_hi;
txnid_t gc_first;
intptr_t return_left;
#ifndef MDBX_DEBUG_GCU
#define MDBX_DEBUG_GCU 0
#endif
#if MDBX_DEBUG_GCU
struct {
txnid_t prev;
unsigned n;
} dbg;
#endif /* MDBX_DEBUG_GCU */
rkl_t ready4reuse, sequel;
#if MDBX_ENABLE_BIGFOOT
txnid_t bigfoot;
#endif /* MDBX_ENABLE_BIGFOOT */
@ -20,21 +43,34 @@ typedef struct gc_update_context {
MDBX_cursor cursor;
cursor_couple_t couple;
};
gc_dense_histogram_t dense_histogram;
} gcu_t;
/* Prepares a GC-update context for the given transaction.
 *
 * Everything in *ctx that precedes the `cursor` member is zeroed, then
 * `dense` is set when the txn id does not exceed MIN_TXNID and, with
 * MDBX_ENABLE_BIGFOOT, `bigfoot` is seeded with the txn id. Finally the
 * context's cursor is initialized on FREE_DBI.
 *
 * Returns the result of cursor_init(). */
static inline int gc_update_init(MDBX_txn *txn, gcu_t *ctx) {
  const size_t plain_part = offsetof(gcu_t, cursor);
  memset(ctx, 0, plain_part);
#if MDBX_ENABLE_BIGFOOT
  ctx->bigfoot = txn->txnid;
#endif /* MDBX_ENABLE_BIGFOOT */
  ctx->dense = (txn->txnid <= MIN_TXNID);
  return cursor_init(&ctx->cursor, txn, FREE_DBI);
}
MDBX_INTERNAL int gc_put_init(MDBX_txn *txn, gcu_t *ctx);
MDBX_INTERNAL void gc_put_destroy(gcu_t *ctx);
#define ALLOC_DEFAULT 0 /* regular/ordinary page allocation */
#define ALLOC_UNIMPORTANT 1 /* the request is unimportant, a failure to allocate will not cause a transaction error */
#define ALLOC_RESERVE 2 /* preparing a reserve for the GC update, without allocation */
#define ALLOC_COALESCE 4 /* internal state/flag */
#define ALLOC_SHOULD_SCAN 8 /* internal state/flag */
#define ALLOC_LIFO 16 /* internal state/flag */
#define ALLOC_DEFAULT 0
#define ALLOC_RESERVE 1
#define ALLOC_UNIMPORTANT 2
MDBX_INTERNAL pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, uint8_t flags);
MDBX_INTERNAL pgr_t gc_alloc_single(const MDBX_cursor *const mc);
MDBX_INTERNAL int gc_update(MDBX_txn *txn, gcu_t *ctx);
/* Returns the size of the transaction's page stockpile: the number of
 * entries in wr.repnl plus wr.loose_count. */
MDBX_NOTHROW_PURE_FUNCTION static inline size_t gc_stockpile(const MDBX_txn *txn) {
  const size_t loose = txn->wr.loose_count;
  const size_t listed = MDBX_PNL_GETSIZE(txn->wr.repnl);
  return listed + loose;
}
MDBX_INTERNAL bool gc_repnl_has_span(MDBX_txn *txn, const size_t num);
/* Tells whether the GC record `id` is already accounted for by this
 * transaction, i.e. it is contained in either wr.gc.reclaimed or
 * wr.gc.comeback. The reclaimed list is checked first; comeback is
 * consulted only when the id is not found there. */
static inline bool gc_is_reclaimed(const MDBX_txn *txn, const txnid_t id) {
  if (rkl_contain(&txn->wr.gc.reclaimed, id))
    return true;
  if (rkl_contain(&txn->wr.gc.comeback, id))
    return true;
  return false;
}
/* Returns the smaller of two transaction ids. */
static inline txnid_t txnid_min(txnid_t a, txnid_t b) {
  if (a < b)
    return a;
  return b;
}
/* Returns the larger of two transaction ids. */
static inline txnid_t txnid_max(txnid_t a, txnid_t b) {
  if (a > b)
    return a;
  return b;
}

View File

@ -214,10 +214,9 @@ struct MDBX_txn {
troika_t troika;
pnl_t __restrict repnl; /* Reclaimed GC pages */
struct {
/* The list of reclaimed txn-ids from GC */
txl_t __restrict retxl;
txnid_t last_reclaimed; /* ID of last used record */
uint64_t time_acc;
rkl_t reclaimed; /* The list of reclaimed txn-ids from GC */
uint64_t spent; /* Time spent reading and searching GC */
rkl_t comeback; /* The list of ids of records returned into GC during commit, etc */
} gc;
bool prefault_write_activated;
#if MDBX_ENABLE_REFUND
@ -287,13 +286,14 @@ struct MDBX_cursor {
};
/* флаги проверки, в том числе биты для проверки типа листовых страниц. */
uint8_t checking;
uint8_t pad;
/* Указывает на txn->dbi_state[] для DBI этого курсора.
* Модификатор __restrict тут полезен и безопасен в текущем понимании,
* так как пересечение возможно только с dbi_state транзакции,
* и происходит по-чтению до последующего изменения/записи. */
uint8_t *__restrict dbi_state;
/* Связь списка отслеживания курсоров в транзакции */
/* Связь списка отслеживания курсоров в транзакции. */
MDBX_txn *txn;
/* Указывает на tree->dbs[] для DBI этого курсора. */
tree_t *tree;
@ -362,15 +362,14 @@ struct MDBX_env {
uint16_t subpage_reserve_prereq;
uint16_t subpage_reserve_limit;
atomic_pgno_t mlocked_pgno;
uint8_t ps2ln; /* log2 of DB page size */
int8_t stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t merge_threshold, merge_threshold_gc; /* pages emptier than this are
candidates for merging */
unsigned max_readers; /* size of the reader table */
MDBX_dbi max_dbi; /* size of the DB table */
uint32_t pid; /* process ID of this env */
osal_thread_key_t me_txkey; /* thread-key for readers */
struct { /* path to the DB files */
uint8_t ps2ln; /* log2 of DB page size */
int8_t stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t merge_threshold; /* pages emptier than this are candidates for merging */
unsigned max_readers; /* size of the reader table */
MDBX_dbi max_dbi; /* size of the DB table */
uint32_t pid; /* process ID of this env */
osal_thread_key_t me_txkey; /* thread-key for readers */
struct { /* path to the DB files */
pathchar_t *lck, *dxb, *specified;
void *buffer;
} pathname;
@ -467,6 +466,9 @@ struct MDBX_env {
/* --------------------------------------------------- mostly volatile part */
MDBX_txn *txn; /* current write transaction */
struct {
txnid_t detent;
} gc;
osal_fastmutex_t dbi_lock;
unsigned n_dbi; /* number of DBs opened */
@ -549,11 +551,7 @@ MDBX_MAYBE_UNUSED static void static_checks(void) {
STATIC_ASSERT(sizeof(clc_t) == 3 * sizeof(void *));
STATIC_ASSERT(sizeof(kvx_t) == 8 * sizeof(void *));
#if MDBX_WORDBITS == 64
#define KVX_SIZE_LN2 6
#else
#define KVX_SIZE_LN2 5
#endif
#define KVX_SIZE_LN2 MDBX_WORDBITS_LN2
STATIC_ASSERT(sizeof(kvx_t) == (1u << KVX_SIZE_LN2));
}
#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */

View File

@ -300,7 +300,7 @@ __cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked, int *d
return rc;
}
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
__cold bool mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
osal_memory_fence(mo_AcquireRelease, false);
MDBX_hsr_func *const callback = env->hsr_callback;
@ -410,5 +410,5 @@ __cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, straggler, oldest, turn);
callback(env, env->txn, 0, 0, straggler, (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
}
return oldest;
return oldest > straggler;
}

View File

@ -56,7 +56,7 @@ typedef const pgno_t *const_pnl_t;
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0)
MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
assert(size > 0 && size <= PAGELIST_LIMIT);
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
@ -71,7 +71,7 @@ MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) {
return bytes;
}
MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) {
size_t size = bytes / sizeof(pgno_t);
assert(size > 3 && size <= PAGELIST_LIMIT + /* alignment gap */ 65536);
size -= 3;
@ -114,7 +114,7 @@ MDBX_INTERNAL int __must_check_result pnl_append_span(__restrict pnl_t *ppnl, pg
MDBX_INTERNAL int __must_check_result pnl_insert_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n);
MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno);
MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl);
@ -130,7 +130,8 @@ MDBX_MAYBE_UNUSED static inline void pnl_sort(pnl_t pnl, size_t limit4check) {
(void)limit4check;
}
MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno, size_t limit) {
MDBX_NOTHROW_PURE_FUNCTION MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno,
size_t limit) {
assert(pnl_check_allocated(pnl, limit));
if (MDBX_HAVE_CMOV) {
/* cmov-ускоренный бинарный поиск может читать (но не использовать) один

View File

@ -15,9 +15,8 @@ MDBX_INTERNAL bsr_t mvcc_bind_slot(MDBX_env *env);
MDBX_MAYBE_UNUSED MDBX_INTERNAL pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest);
MDBX_INTERNAL txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady);
MDBX_INTERNAL pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page);
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler);
MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rlocked, int *dead);
MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard);
MDBX_INTERNAL bool mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard);
/* dxb.c */
MDBX_INTERNAL int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bits);
@ -62,10 +61,11 @@ struct commit_timestamp {
};
MDBX_INTERNAL bool txn_refund(MDBX_txn *txn);
MDBX_INTERNAL txnid_t txn_snapshot_oldest(const MDBX_txn *const txn);
MDBX_INTERNAL bool txn_gc_detent(const MDBX_txn *const txn);
MDBX_INTERNAL int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits);
MDBX_INTERNAL void txn_done_cursors(MDBX_txn *txn);
MDBX_INTERNAL int txn_shadow_cursors(const MDBX_txn *parent, const size_t dbi);
MDBX_INTERNAL MDBX_cursor *txn_gc_cursor(MDBX_txn *txn);
MDBX_INTERNAL MDBX_txn *txn_alloc(const MDBX_txn_flags_t flags, MDBX_env *env);
MDBX_INTERNAL int txn_abort(MDBX_txn *txn);

View File

@ -33,6 +33,7 @@ typedef struct MDBX_rkl {
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_init(rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_clear(rkl_t *rkl);
/* Empties the list by delegating to rkl_clear(). Nothing beyond that is done yet:
 * presumably the TODO below stands for the not-yet-implemented "shrink" step. */
static inline void rkl_clear_and_shrink(rkl_t *rkl) { rkl_clear(rkl); /* TODO */ }
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_destroy(rkl_t *rkl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void rkl_destructive_move(rkl_t *dst, rkl_t *src);
MDBX_MAYBE_UNUSED MDBX_INTERNAL __must_check_result int rkl_copy(const rkl_t *src, rkl_t *dst);

View File

@ -63,14 +63,14 @@ static int txl_reserve(txl_t __restrict *__restrict ptxl, const size_t wanna) {
return MDBX_ENOMEM;
}
static __always_inline int __must_check_result txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
/* Makes sure the list behind `ptxl` has room for `num` more items.
 *
 * Returns MDBX_SUCCESS when the allocated length already covers the current
 * size plus `num`; otherwise returns whatever txl_reserve() yields when asked
 * for that total. */
static inline int __must_check_result txl_need(txl_t __restrict *__restrict ptxl, size_t num) {
/* Sanity checks: the current size is within txl_max and does not exceed the allocated length. */
assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max && MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl));
assert(num <= PAGELIST_LIMIT);
/* Total number of items wanted: the current size plus the requested extra. */
const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptxl) + num;
/* Fast path: enough room is already allocated, so no reservation is needed. */
return likely(MDBX_PNL_ALLOCLEN(*ptxl) >= wanna) ? MDBX_SUCCESS : txl_reserve(ptxl, wanna);
}
static __always_inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
static inline void txl_xappend(txl_t __restrict txl, txnid_t id) {
assert(MDBX_PNL_GETSIZE(txl) < MDBX_PNL_ALLOCLEN(txl));
txl[0] += 1;
MDBX_PNL_LAST(txl) = id;

View File

@ -15,12 +15,12 @@ enum txl_rules {
txl_max = (1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)
};
MDBX_INTERNAL txl_t txl_alloc(void);
MDBX_MAYBE_UNUSED MDBX_INTERNAL txl_t txl_alloc(void);
MDBX_INTERNAL void txl_free(txl_t txl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void txl_free(txl_t txl);
MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, txnid_t id);
MDBX_MAYBE_UNUSED MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, txnid_t id);
MDBX_INTERNAL void txl_sort(txl_t txl);
MDBX_MAYBE_UNUSED MDBX_INTERNAL void txl_sort(txl_t txl);
MDBX_INTERNAL bool txl_contain(const txl_t txl, txnid_t id);
MDBX_MAYBE_UNUSED MDBX_INTERNAL bool txl_contain(const txl_t txl, txnid_t id);

View File

@ -62,6 +62,8 @@ __cold MDBX_txn *txn_basal_create(const size_t max_dbi) {
if (unlikely(!txn))
return txn;
rkl_init(&txn->wr.gc.reclaimed);
rkl_init(&txn->wr.gc.comeback);
txn->dbs = ptr_disp(txn, base);
txn->cursors = ptr_disp(txn->dbs, max_dbi * sizeof(txn->dbs[0]));
txn->dbi_seqs = ptr_disp(txn->cursors, max_dbi * sizeof(txn->cursors[0]));
@ -82,7 +84,8 @@ __cold MDBX_txn *txn_basal_create(const size_t max_dbi) {
__cold void txn_basal_destroy(MDBX_txn *txn) {
dpl_free(txn);
txl_free(txn->wr.gc.retxl);
rkl_destroy(&txn->wr.gc.reclaimed);
rkl_destroy(&txn->wr.gc.comeback);
pnl_free(txn->wr.retired_pages);
pnl_free(txn->wr.spilled.list);
pnl_free(txn->wr.repnl);
@ -121,10 +124,9 @@ int txn_basal_start(MDBX_txn *txn, unsigned flags) {
MDBX_PNL_SETSIZE(txn->wr.retired_pages, 0);
txn->wr.spilled.list = nullptr;
txn->wr.spilled.least_removed = 0;
txn->wr.gc.time_acc = 0;
txn->wr.gc.last_reclaimed = 0;
if (txn->wr.gc.retxl)
MDBX_PNL_SETSIZE(txn->wr.gc.retxl, 0);
txn->wr.gc.spent = 0;
tASSERT(txn, rkl_empty(&txn->wr.gc.reclaimed));
txn->env->gc.detent = 0;
env->txn = txn;
return MDBX_SUCCESS;
@ -140,6 +142,8 @@ int txn_basal_end(MDBX_txn *txn, unsigned mode) {
env->txn = nullptr;
pnl_free(txn->wr.spilled.list);
txn->wr.spilled.list = nullptr;
rkl_clear_and_shrink(&txn->wr.gc.reclaimed);
rkl_clear_and_shrink(&txn->wr.gc.comeback);
eASSERT(env, txn->parent == nullptr);
pnl_shrink(&txn->wr.retired_pages);
@ -258,9 +262,19 @@ int txn_basal_commit(MDBX_txn *txn, struct commit_timestamp *ts) {
}
gcu_t gcu_ctx;
int rc = gc_update_init(txn, &gcu_ctx);
int rc = gc_put_init(txn, &gcu_ctx);
if (likely(rc == MDBX_SUCCESS))
rc = gc_update(txn, &gcu_ctx);
#if MDBX_ENABLE_BIGFOOT
const txnid_t commit_txnid = gcu_ctx.bigfoot;
if (commit_txnid > txn->txnid)
TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid));
#else
const txnid_t commit_txnid = txn->txnid;
#endif
gc_put_destroy(&gcu_ctx);
if (ts)
ts->gc_cpu = osal_cputime(nullptr) - ts->gc_cpu;
if (unlikely(rc != MDBX_SUCCESS))
@ -334,13 +348,6 @@ int txn_basal_commit(MDBX_txn *txn, struct commit_timestamp *ts) {
meta.canary = txn->canary;
memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid));
txnid_t commit_txnid = txn->txnid;
#if MDBX_ENABLE_BIGFOOT
if (gcu_ctx.bigfoot > txn->txnid) {
commit_txnid = gcu_ctx.bigfoot;
TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid));
}
#endif
meta.unsafe_sign = DATASIGN_NONE;
meta_set_txnid(env, &meta, commit_txnid);

View File

@ -349,6 +349,7 @@ int txn_nested_create(MDBX_txn *parent, const MDBX_txn_flags_t flags) {
return LOG_IFERR(MDBX_ENOMEM);
tASSERT(parent, dpl_check(parent));
rkl_init(&txn->wr.gc.reclaimed);
#if MDBX_ENABLE_DBI_SPARSE
txn->dbi_sparse = parent->dbi_sparse;
#endif /* MDBX_ENABLE_DBI_SPARSE */
@ -403,12 +404,11 @@ int txn_nested_create(MDBX_txn *parent, const MDBX_txn_flags_t flags) {
= parent->geo.first_unallocated) -
MDBX_ENABLE_REFUND));
txn->wr.gc.time_acc = parent->wr.gc.time_acc;
txn->wr.gc.last_reclaimed = parent->wr.gc.last_reclaimed;
if (parent->wr.gc.retxl) {
txn->wr.gc.retxl = parent->wr.gc.retxl;
parent->wr.gc.retxl = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->wr.gc.retxl);
}
txn->wr.gc.spent = parent->wr.gc.spent;
rkl_init(&txn->wr.gc.comeback);
err = rkl_copy(&parent->wr.gc.reclaimed, &txn->wr.gc.reclaimed);
if (unlikely(err != MDBX_SUCCESS))
return err;
txn->wr.retired_pages = parent->wr.retired_pages;
parent->wr.retired_pages = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->wr.retired_pages);
@ -438,6 +438,7 @@ int txn_nested_create(MDBX_txn *parent, const MDBX_txn_flags_t flags) {
(txn->parent ? txn->parent->wr.dirtyroom : txn->env->options.dp_limit));
parent->env->txn = txn;
tASSERT(parent, parent->cursors[FREE_DBI] == nullptr);
// TODO: shadow GC's cursor
return txn_shadow_cursors(parent, MAIN_DBI);
}
@ -447,11 +448,7 @@ void txn_nested_abort(MDBX_txn *nested) {
nested->signature = 0;
nested->owner = 0;
if (nested->wr.gc.retxl) {
tASSERT(parent, MDBX_PNL_GETSIZE(nested->wr.gc.retxl) >= (uintptr_t)parent->wr.gc.retxl);
MDBX_PNL_SETSIZE(nested->wr.gc.retxl, (uintptr_t)parent->wr.gc.retxl);
parent->wr.gc.retxl = nested->wr.gc.retxl;
}
rkl_destroy(&nested->wr.gc.reclaimed);
if (nested->wr.retired_pages) {
tASSERT(parent, MDBX_PNL_GETSIZE(nested->wr.retired_pages) >= (uintptr_t)parent->wr.retired_pages);
@ -530,17 +527,14 @@ int txn_nested_join(MDBX_txn *txn, struct commit_timestamp *ts) {
//-------------------------------------------------------------------------
parent->wr.gc.retxl = txn->wr.gc.retxl;
txn->wr.gc.retxl = nullptr;
parent->wr.retired_pages = txn->wr.retired_pages;
txn->wr.retired_pages = nullptr;
pnl_free(parent->wr.repnl);
parent->wr.repnl = txn->wr.repnl;
txn->wr.repnl = nullptr;
parent->wr.gc.time_acc = txn->wr.gc.time_acc;
parent->wr.gc.last_reclaimed = txn->wr.gc.last_reclaimed;
parent->wr.gc.spent = txn->wr.gc.spent;
rkl_destructive_move(&txn->wr.gc.reclaimed, &parent->wr.gc.reclaimed);
parent->geo = txn->geo;
parent->canary = txn->canary;

View File

@ -3,8 +3,18 @@
#include "internals.h"
__hot txnid_t txn_snapshot_oldest(const MDBX_txn *const txn) {
return mvcc_shapshot_oldest(txn->env, txn->wr.troika.txnid[txn->wr.troika.prefer_steady]);
/* Returns a cursor pointer computed as ptr_disp(env->basal_txn, sizeof(MDBX_txn)),
 * where env is txn->env. Note that the result is derived from the environment's
 * basal transaction, not from `txn` itself.
 *
 * NOTE(review): judging by the name this is the cursor used for GC, presumably
 * stored right behind the basal MDBX_txn structure; the allocation that sets up
 * this layout is not visible here, so confirm against txn_basal_create().
 *
 * Asserts that none of the MDBX_TXN_BLOCKED / MDBX_TXN_RDONLY bits is set in txn->flags. */
MDBX_cursor *txn_gc_cursor(MDBX_txn *txn) {
tASSERT(txn, (txn->flags & (MDBX_TXN_BLOCKED | MDBX_TXN_RDONLY)) == 0);
return ptr_disp(txn->env->basal_txn, sizeof(MDBX_txn));
}
/* Refreshes txn->env->gc.detent with the value returned by mvcc_shapshot_oldest(),
 * which is called with the environment and the troika's txnid selected by
 * prefer_steady.
 *
 * Returns true when the stored value was changed by this call, and false when
 * it already equalled the freshly computed one.
 *
 * Although `txn` is const-qualified, the environment it points to is modified. */
__hot bool txn_gc_detent(const MDBX_txn *const txn) {
const txnid_t detent = mvcc_shapshot_oldest(txn->env, txn->wr.troika.txnid[txn->wr.troika.prefer_steady]);
/* The unchanged case is hinted as the likely one: nothing to store. */
if (likely(detent == txn->env->gc.detent))
return false;
txn->env->gc.detent = detent;
return true;
}
void txn_done_cursors(MDBX_txn *txn) {
@ -417,12 +427,9 @@ MDBX_txn *txn_alloc(const MDBX_txn_flags_t flags, MDBX_env *env) {
txn = osal_malloc(size);
if (unlikely(!txn))
return txn;
#if MDBX_DEBUG
memset(txn, 0xCD, size);
VALGRIND_MAKE_MEM_UNDEFINED(txn, size);
#endif /* MDBX_DEBUG */
MDBX_ANALYSIS_ASSUME(size > base);
memset(txn, 0, (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base);
txn->dbs = ptr_disp(txn, base);
txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0]));
#if MDBX_DEBUG

View File

@ -3,6 +3,17 @@
#include "internals.h"
/* Returns the base-2 logarithm of `value_uintptr` rounded up to the next power of two.
 *
 * The argument is first rounded up to a power of two (a value that already is a
 * power of two stays as is), then log2n_powerof2() is applied to the result.
 * The argument must be greater than zero and less than INT32_MAX (asserted).
 *
 * NOTE(review): for arguments above 2^30 the rounded value is 2^31, which is
 * beyond the `< INT32_MAX` bound asserted inside log2n_powerof2(); confirm that
 * callers never pass such values. */
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned ceil_log2n(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX);
/* Subtract one so that an exact power of two is not bumped to the next one. */
value_uintptr -= 1;
/* Smear the highest set bit into all lower positions; shifts of 1..16 cover
 * 32 bits, which is enough given the range asserted above. */
value_uintptr |= value_uintptr >> 1;
value_uintptr |= value_uintptr >> 2;
value_uintptr |= value_uintptr >> 4;
value_uintptr |= value_uintptr >> 8;
value_uintptr |= value_uintptr >> 16;
/* Adding one turns the all-ones pattern into the power of two itself. */
return log2n_powerof2(value_uintptr + 1);
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned log2n_powerof2(size_t value_uintptr) {
assert(value_uintptr > 0 && value_uintptr < INT32_MAX && is_powerof2(value_uintptr));
assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr);

View File

@ -58,6 +58,8 @@ MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED static inline size_t ceil_powerof2
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned log2n_powerof2(size_t value_uintptr);
MDBX_NOTHROW_CONST_FUNCTION MDBX_MAYBE_UNUSED MDBX_INTERNAL unsigned ceil_log2n(size_t value_uintptr);
MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v);
struct monotime_cache {