diff --git a/mdbx.h b/mdbx.h
index 35c3318a..b1cfc0f0 100644
--- a/mdbx.h
+++ b/mdbx.h
@@ -2561,9 +2561,6 @@ struct MDBX_envinfo {
         msync; /**< Number of explicit msync-to-disk operations (not a pages) */
     uint64_t
         fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */
-    uint64_t
-        gcrtime_seconds16dot16; /**< Time spent loading and searching inside
-                                     GC (aka FreeDB) in 1/65536 of second */
   } mi_pgop_stat;
 };
 #ifndef __cplusplus
@@ -3713,8 +3710,8 @@ struct MDBX_commit_latency {
   /** \brief Duration of preparation (commit child transactions, update
    * sub-databases records and cursors destroying). */
   uint32_t preparation;
-  /** \brief Duration of GC/freeDB handling & updation. */
-  uint32_t gc;
+  /** \brief Duration of GC update by wall clock. */
+  uint32_t gc_wallclock;
   /** \brief Duration of internal audit if enabled. */
   uint32_t audit;
   /** \brief Duration of writing dirty/modified data pages to a filesystem,
@@ -3727,6 +3724,80 @@ struct MDBX_commit_latency {
   uint32_t ending;
   /** \brief The total duration of a commit. */
   uint32_t whole;
+  /** \brief User-mode CPU time spent on GC update. */
+  uint32_t gc_cputime;
+
+  /** \brief Information for profiling the GC work.
+   * \note The statistics are shared between all processes working with the
+   * same DB file and are stored in the LCK-file. The data is accumulated as
+   * transactions are committed, but only in libmdbx builds with the
+   * \ref MDBX_ENABLE_PROFGC option set. The collected statistics are
+   * returned to any process via \ref mdbx_txn_commit_ex() and are reset
+   * at the same time on completion of top-level (non-nested) transactions. */
+  struct {
+    /** \brief Number of GC-update iterations,
+     * greater than 1 if there were retries/restarts. */
+    uint32_t wloops;
+    /** \brief Number of iterations of merging GC records. */
+    uint32_t coalescences;
+    /** \brief Number of times previous durable/steady commit points
+     * were destroyed while running in \ref MDBX_UTTERLY_NOSYNC mode. */
+    uint32_t wipes;
+    /** \brief Number of forced commits to disk
+     * in order to avoid database growth when running outside of the
+     * \ref MDBX_UTTERLY_NOSYNC mode. */
+    uint32_t flushes;
+    /** \brief Number of calls to the Handle-Slow-Readers mechanism
+     * in order to avoid database growth.
+     * \see MDBX_hsr_func */
+    uint32_t kicks;
+
+    /** \brief Slow path execution count
+     * of the GC for the sake of user data. */
+    uint32_t work_counter;
+    /** \brief Wall clock time spent reading and searching
+     * within the GC for the sake of user data. */
+    uint32_t work_rtime_monotonic;
+    /** \brief Monotonic wall clock time spent preparing pages
+     * fetched from the GC for user data,
+     * including swap-in from disk. */
+    uint32_t work_xtime_monotonic;
+    /** \brief User-mode CPU time spent reading and searching
+     * within the GC for the sake of user data. */
+    uint32_t work_rtime_cpu;
+    /** \brief Number of search iterations within the GC
+     * while allocating pages for user data. */
+    uint32_t work_rsteps;
+    /** \brief Number of requests to allocate sequences of pages
+     * for user data. */
+    uint32_t work_xpages;
+    /** \brief Number of page faults within the GC
+     * while allocating and preparing pages for user data. */
+    uint32_t work_majflt;
+
+    /** \brief Slow path execution count of the GC
+     * for the purposes of maintaining and updating the GC itself. */
+    uint32_t self_counter;
+    /** \brief Wall clock time spent reading and searching within the GC
+     * for the purposes of maintaining and updating the GC itself. */
+    uint32_t self_rtime_monotonic;
+    /** \brief Monotonic wall clock time spent preparing pages fetched
+     * from the GC for maintaining and updating the GC itself,
+     * including swap-in from disk. */
+    uint32_t self_xtime_monotonic;
+    /** \brief User-mode CPU time spent reading and searching within the GC
+     * for the purposes of maintaining and updating the GC itself. */
+    uint32_t self_rtime_cpu;
+    /** \brief Number of search iterations within the GC while allocating
+     * pages for maintaining and updating the GC itself. */
+    uint32_t self_rsteps;
+    /** \brief Number of requests to allocate sequences of pages
+     * for the GC itself. */
+    uint32_t self_xpages;
+    /** \brief Number of page faults within the GC
+     * while allocating and preparing pages for the GC itself. */
+    uint32_t self_majflt;
+  } gc_prof;
 };
 #ifndef __cplusplus
 /** \ingroup c_statinfo */
diff --git a/src/bits.md b/src/bits.md
index 82c9eed4..a9b7c2b1 100644
--- a/src/bits.md
+++ b/src/bits.md
@@ -5,8 +5,8 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD
 2 |0000 0004|ALLOC_NEW  |TXN_DIRTY     |DUPSORT   |           |DBI_FRESH   |F_DUPDATA|P_OVERFLOW| |
 3 |0000 0008|ALLOC_SLOT |TXN_SPILLS    |INTEGERKEY|           |DBI_CREAT   |         |P_META    | |
 4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED  |NOOVERWRITE|DBI_VALID   |         |P_BAD     | |
-5 |0000 0020|           |              |INTEGERDUP|NODUPDATA  |DBI_USRVALID|         |P_LEAF2   | |
-6 |0000 0040|           |              |REVERSEDUP|CURRENT    |DBI_DUPDATA |         |P_SUBP    | |
+5 |0000 0020|           |TXN_UPDATE_GC |INTEGERDUP|NODUPDATA  |DBI_USRVALID|         |P_LEAF2   | |
+6 |0000 0040|           |TXN_FROZEN_RE |REVERSEDUP|CURRENT    |DBI_DUPDATA |         |P_SUBP    | |
 7 |0000 0080|           |              |          |ALLDUPS    |DBI_AUDITED |         |          | |
 8 |0000 0100| _MAY_MOVE |              |          |           |            |         |          | <= |
 9 |0000 0200| _MAY_UNMAP|              |          |           |            |         |          | <= |
diff --git a/src/core.c b/src/core.c
index 60b060b9..0f13c846 100644
--- a/src/core.c
+++ b/src/core.c
@@ -2442,19 +2442,21 @@ pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a,
 }
 
 /* Merge a PNL onto a PNL. The destination PNL must be big enough */
-__hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) {
+__hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) {
   assert(pnl_check_allocated(dst, MAX_PAGENO + 1));
   assert(pnl_check(src, MAX_PAGENO + 1));
   const size_t src_len = MDBX_PNL_GETSIZE(src);
   const size_t dst_len = MDBX_PNL_GETSIZE(dst);
+  size_t total = dst_len;
   if (likely(src_len > 0)) {
-    const size_t total = dst_len + src_len;
+    total += src_len;
     assert(MDBX_PNL_ALLOCLEN(dst) >= total);
     dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 
0 : P_INVALID); pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); MDBX_PNL_SETSIZE(dst, total); } assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + return total; } static void spill_remove(MDBX_txn *txn, size_t idx, pgno_t npages) { @@ -3791,15 +3793,15 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { } const size_t rpa = - pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, txn->mt_next_pgno); - tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) || - txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); - if (rpa <= MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) && - unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) + pnl_search(txn->tw.relist, dp->mp_pgno, txn->mt_next_pgno); + tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.relist) || + txn->tw.relist[rpa] != dp->mp_pgno); + if (rpa <= MDBX_PNL_GETSIZE(txn->tw.relist) && + unlikely(txn->tw.relist[rpa] == dp->mp_pgno)) return false; if (num > 1) { - const size_t rpb = pnl_search(txn->tw.reclaimed_pglist, - dp->mp_pgno + num - 1, txn->mt_next_pgno); + const size_t rpb = + pnl_search(txn->tw.relist, dp->mp_pgno + num - 1, txn->mt_next_pgno); tASSERT(txn, rpa == rpb); if (unlikely(rpa != rpb)) return false; @@ -3828,7 +3830,7 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { static void refund_reclaimed(MDBX_txn *txn) { /* Scanning in descend order */ pgno_t next_pgno = txn->mt_next_pgno; - const MDBX_PNL pnl = txn->tw.reclaimed_pglist; + const MDBX_PNL pnl = txn->tw.relist; tASSERT(txn, MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); #if MDBX_PNL_ASCENDING size_t i = MDBX_PNL_GETSIZE(pnl); @@ -3849,8 +3851,7 @@ static void refund_reclaimed(MDBX_txn *txn) { VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); txn->mt_next_pgno = next_pgno; - tASSERT(txn, - pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - 1)); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - 1)); } static void refund_loose(MDBX_txn *txn) { @@ -4005,8 +4006,8 @@ static bool txn_refund(MDBX_txn *txn) { refund_loose(txn); while (true) { - if (MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) == 0 || - MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) + if (MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) != txn->mt_next_pgno - 1) break; refund_reclaimed(txn); @@ -4336,8 +4337,8 @@ status_done: reclaim: DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); - rc = pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + rc = pnl_insert_range(&txn->tw.relist, pgno, npages); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); return rc; @@ -5498,7 +5499,7 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, if (txn->tw.loose_count) { MDBX_page *loose = txn->tw.loose_pages; DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); - rc = pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); + rc = pnl_insert_range(&txn->tw.relist, loose->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; size_t di = dpl_search(txn, loose->mp_pgno); @@ -6564,13 +6565,13 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, //------------------------------------------------------------------------------ /* Allocate page numbers and memory for writing. 
Maintain mt_last_reclaimed, - * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. + * mt_relist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the GC, just merge GC records into mt_reclaimed_pglist + * Do not modify the GC, just merge GC records into mt_relist * and move mt_last_reclaimed to say which records were consumed. Only this - * function can create mt_reclaimed_pglist and move + * function can create mt_relist and move * mt_last_reclaimed/mt_next_pgno. * * [in] mc cursor A cursor handle identifying the transaction and @@ -6586,267 +6587,318 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, #define MDBX_ALLOC_RESERVE 16 #define MDBX_ALLOC_BACKLOG 32 #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW) +#define MDBX_ALLOC_LIFO 128 + +static __inline bool is_gc_usable(const MDBX_txn *txn) { + /* If txn is updating the GC, then the retired-list cannot play catch-up with + * itself by growing while trying to save it. */ + if (txn->mt_flags & (MDBX_TXN_UPDATE_GC | MDBX_TXN_FROZEN_RE)) + return false; + + /* avoid (recursive) search inside empty tree and while tree is + updating, todo4recovery://erased_by_github/libmdbx/issues/31 */ + if (txn->mt_dbs[FREE_DBI].md_entries == 0) + return false; + + /* If our dirty list is already full, we can't touch GC */ + if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && + !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) + return false; + + return true; +} + +static int gc_cursor_init(MDBX_cursor *mc, MDBX_txn *txn) { + if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->mt_dbs[FREE_DBI].md_flags); + return MDBX_CORRUPTED; + } + return cursor_init(mc, txn, FREE_DBI); +} + +static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const size_t num, + char flags) { +#if MDBX_ENABLE_PROFGC + const uint64_t monotime_before = osal_monotime(); + size_t majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + uint64_t monotime_shot = 0; +#endif /* MDBX_ENABLE_PROFGC */ -static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; +#if MDBX_ENABLE_PROFGC + profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? &env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; + prof->spe_counter += 1; +#endif /* MDBX_ENABLE_PROFGC */ + eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); + eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | + MDBX_ALLOC_BACKLOG)) == 0 || + (flags & MDBX_ALLOC_GC)); + eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE | + MDBX_ALLOC_BACKLOG)) == 0 || + (flags & MDBX_ALLOC_NEW) == 0); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; - if (likely(flags & MDBX_ALLOC_GC)) { - if (unlikely( - /* If mc is updating the GC, then the retired-list cannot play - catch-up with itself by growing while trying to save it. 
*/ - (mc->mc_flags & (C_RECLAIMING | C_GCFREEZE)) || - /* avoid (recursive) search inside empty tree and while tree is - updating, todo4recovery://erased_by_github/libmdbx/issues/31 */ - txn->mt_dbs[FREE_DBI].md_entries == 0 || - /* If our dirty list is already full, we can't touch GC */ - (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && - !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) - flags -= MDBX_ALLOC_GC; - else { - flags |= env->me_flags & MDBX_LIFORECLAIM; - if (txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) < coalesce_threshold) - flags |= MDBX_ALLOC_COALESCE; + pgno_t pgno = 0, *range = nullptr; + size_t re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + if (num > 1) { + eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); +#if MDBX_ENABLE_PROFGC + prof->xpages += 1; +#endif /* MDBX_ENABLE_PROFGC */ + if (re_len >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); + if (likely(range)) { + pgno = *range; + goto done; + } } + } else { + eASSERT(env, (flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) || + MDBX_PNL_GETSIZE(txn->tw.relist) == 0); } - eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; - size_t re_len = MDBX_PNL_GETSIZE(re_list); - pgno_t *range = nullptr; - txnid_t detent = 0, last = 0; -#if MDBX_ENABLE_PGOP_STAT - uint64_t timestamp = 0; -#endif /* MDBX_ENABLE_PGOP_STAT */ + //--------------------------------------------------------------------------- - while (true) { /* hsr-kick retry loop */ - MDBX_cursor_couple recur; - for (MDBX_cursor_op op = MDBX_FIRST;; - op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { - MDBX_val key, data; + if (likely(flags & MDBX_ALLOC_GC)) { + if (unlikely(!is_gc_usable(txn))) + goto no_gc; - /* Seek a big enough contiguous page range. - * Prefer pages with lower pgno. */ - eASSERT(env, - pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { - eASSERT(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && - MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); - range = re_list + (MDBX_PNL_ASCENDING ? 1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - tASSERT(txn, range == scan4range_checker(re_list, num - 1)); - if (likely(range)) { - pgno = *range; - goto done; - } - } + eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO)) == 0); + flags += (env->me_flags & MDBX_LIFORECLAIM) ? 
MDBX_ALLOC_LIFO : 0; - if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ - if (unlikely(!(flags & MDBX_ALLOC_GC))) - break /* reclaiming is prohibited for now */; + const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2; + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + MDBX_PNL_GETSIZE(txn->tw.relist) < coalesce_threshold && num) + flags += MDBX_ALLOC_COALESCE; - /* Prepare to fetch and coalesce */ -#if MDBX_ENABLE_PGOP_STAT - if (likely(timestamp == 0)) - timestamp = osal_monotime(); -#endif /* MDBX_ENABLE_PGOP_STAT */ - detent = txn_oldest_reader(txn) + 1; + MDBX_cursor recur; + ret.err = gc_cursor_init(&recur, txn); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; - ret.err = cursor_init(&recur.outer, txn, FREE_DBI); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - if (flags & MDBX_LIFORECLAIM) { - /* Begin from oldest reader if any */ - if (detent > MIN_TXNID) { - last = detent - 1; - op = MDBX_SET_RANGE; - } - } else if (txn->tw.last_reclaimed) { - /* Continue lookup from txn->tw.last_reclaimed to oldest reader */ - last = txn->tw.last_reclaimed; - op = MDBX_SET_RANGE; - } + retry_gc_refresh_oldest:; + txnid_t oldest = txn_oldest_reader(txn); + if (unlikely(!oldest)) + goto no_gc; - key.iov_base = &last; - key.iov_len = sizeof(last); - } + retry_gc_have_oldest: + if (unlikely(oldest >= txn->mt_txnid)) { + ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN + " for current-txnid %" PRIaTXN, + oldest, txn->mt_txnid); + ret.err = MDBX_PROBLEM; + goto fail; + } + const txnid_t detent = oldest + 1; - if (!(flags & MDBX_LIFORECLAIM)) { - /* Do not try fetch more if the record will be too recent */ - if (op != MDBX_FIRST && ++last >= detent) { - detent = txn_oldest_reader(txn) + 1; - if (detent <= last) - break; - } - } - - ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); - if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { - if (op == MDBX_SET_RANGE) - continue; - const txnid_t snap = txn_oldest_reader(txn); - if (unlikely(detent <= snap)) { - detent = snap + 1; - last = snap; - key.iov_base = &last; - key.iov_len = sizeof(last); - op = MDBX_SET_RANGE; - ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); - } - } - if (unlikely(ret.err)) { - if (ret.err == MDBX_NOTFOUND) - break; - goto fail; - } - - if (unlikely(key.iov_len != sizeof(txnid_t))) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - last = unaligned_peek_u64(4, key.iov_base); - if (detent <= last) { - detent = txn_oldest_reader(txn) + 1; - if (detent <= last) { - if (flags & MDBX_LIFORECLAIM) - continue; - break; - } - } - - if (flags & MDBX_LIFORECLAIM) { - /* skip IDs of records that already reclaimed */ - if (txn->tw.lifo_reclaimed) { - size_t i; - for (i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i) - if (txn->tw.lifo_reclaimed[i] == last) - break; - if (i) - continue; - } - } - - /* Reading next GC record */ - MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top]; - if (unlikely((ret.err = node_read( - &recur.outer, - page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), - &data, mp)) != MDBX_SUCCESS)) - goto fail; - - if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { + txnid_t last = 0; + bool should_scan = false; + MDBX_cursor_op op = MDBX_FIRST; + if (flags & MDBX_ALLOC_LIFO) { + if (!txn->tw.lifo_reclaimed) { txn->tw.lifo_reclaimed = txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { ret.err = MDBX_ENOMEM; goto fail; } } + /* Begin lookup backward from oldest reader */ + last = detent - 1; + op = MDBX_SET_RANGE; + } else if 
(txn->tw.last_reclaimed) { + /* Continue lookup forward from last-reclaimed */ + last = txn->tw.last_reclaimed + 1; + if (last >= detent) + goto no_gc; + op = MDBX_SET_RANGE; + } - /* Append PNL from GC record to tw.reclaimed_pglist */ - cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); - pgno_t *gc_pnl = (pgno_t *)data.iov_base; - tASSERT(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); - if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || - !pnl_check(gc_pnl, txn->mt_next_pgno))) { - ret.err = MDBX_CORRUPTED; + next_gc:; + MDBX_val key; + key.iov_base = &last; + key.iov_len = sizeof(last); + +#if MDBX_ENABLE_PROFGC + prof->rsteps += 1; +#endif /* MDBX_ENABLE_PROFGC */ + + /* Seek first/next GC record */ + ret.err = mdbx_cursor_get(&recur, &key, NULL, op); + if (unlikely(ret.err != MDBX_SUCCESS)) { + if (unlikely(ret.err != MDBX_NOTFOUND)) goto fail; + if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { + op = MDBX_PREV; + goto next_gc; } - const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); - if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( - txn->tw.reclaimed_pglist) >= - env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ - (flags & MDBX_ALLOC_SLOT) == 0 && - /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + (size_t)num) || - gc_len + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) >= - MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid large/overflow the page list. - * This is a rare case while search for a continuously multi-page region - * in a large database. - * todo4recovery://erased_by_github/libmdbx/issues/123 */ - NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " - "(chunk) -> %zu", - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); - flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); - break; - } - ret.err = pnl_need(&txn->tw.reclaimed_pglist, gc_len); + goto depleted_gc; + } + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + last = unaligned_peek_u64(4, key.iov_base); + if (flags & MDBX_ALLOC_LIFO) { + op = MDBX_PREV; + if (last >= detent) + goto next_gc; + /* skip IDs of records that already reclaimed */ + for (size_t i = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); i > 0; --i) + if (txn->tw.lifo_reclaimed[i] == last) + goto next_gc; + } else { + op = MDBX_NEXT; + if (unlikely(last >= detent)) + goto depleted_gc; + } + + /* Reading next GC record */ + MDBX_val data; + MDBX_page *const mp = recur.mc_pg[recur.mc_top]; + if (unlikely((ret.err = node_read(&recur, + page_node(mp, recur.mc_ki[recur.mc_top]), + &data, mp)) != MDBX_SUCCESS)) + goto fail; + + eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); + pgno_t *gc_pnl = (pgno_t *)data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !pnl_check(gc_pnl, txn->mt_next_pgno))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( + txn->tw.relist) >= env->me_options.rp_augment_limit) && + ((/* not a slot-request from gc-update */ + (flags & MDBX_ALLOC_SLOT) == 0 && + /* have enough unallocated space */ txn->mt_geo.upper >= + txn->mt_next_pgno + num) || + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { + /* Stop reclaiming to avoid large/overflow the page list. 
+   * This is a rare case when searching for a contiguous multi-page region
+   * in a large database.
+   * todo4recovery://erased_by_github/libmdbx/issues/123 */
+    NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu "
+           "(chunk) -> %zu",
+           MDBX_PNL_GETSIZE(txn->tw.relist), gc_len,
+           gc_len + MDBX_PNL_GETSIZE(txn->tw.relist));
+    goto depleted_gc;
+  }
+
+  /* Remember the ID of the GC record just read */
+  txn->tw.last_reclaimed = last;
+  if (flags & MDBX_ALLOC_LIFO) {
+    ret.err = txl_append(&txn->tw.lifo_reclaimed, last);
     if (unlikely(ret.err != MDBX_SUCCESS))
       goto fail;
-    re_list = txn->tw.reclaimed_pglist;
+  }
 
-    /* Remember ID of GC record */
-    if (flags & MDBX_LIFORECLAIM) {
-      ret.err = txl_append(&txn->tw.lifo_reclaimed, last);
-      if (unlikely(ret.err != MDBX_SUCCESS))
-        goto fail;
-    }
-    txn->tw.last_reclaimed = last;
+  /* Append PNL from GC record to tw.relist */
+  ret.err = pnl_need(&txn->tw.relist, gc_len);
+  if (unlikely(ret.err != MDBX_SUCCESS))
+    goto fail;
+  /* note: pnl_need() may have reallocated txn->tw.relist in place */
 
-    if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
-      DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " len %zu, PNL",
-                  last, txn->mt_dbs[FREE_DBI].md_root, gc_len);
-      for (size_t i = gc_len; i; i--)
-        DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]);
-      DEBUG_EXTRA_PRINT("%s\n", ".");
-    }
+  if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
+    DEBUG_EXTRA("read GC-pnl txn %" PRIaTXN " root %" PRIaPGNO
+                " len %zu, PNL",
+                last, txn->mt_dbs[FREE_DBI].md_root, gc_len);
+    for (size_t i = gc_len; i; i--)
+      DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]);
+    DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno);
+  }
 
-    /* Merge in descending sorted order */
-    pnl_merge(re_list, gc_pnl);
-    if (AUDIT_ENABLED() && unlikely(!pnl_check(re_list, txn->mt_next_pgno))) {
+  /* Merge in descending sorted order */
+  re_len = pnl_merge(txn->tw.relist, gc_pnl);
+  should_scan = true;
+  if (AUDIT_ENABLED()) {
+    if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) {
       ret.err = MDBX_CORRUPTED;
       goto fail;
     }
-    tASSERT(txn, dirtylist_check(txn));
+  } else {
+    eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno));
+  }
+  eASSERT(env, dirtylist_check(txn));
 
-    re_len = MDBX_PNL_GETSIZE(re_list);
-    tASSERT(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno);
-    if (MDBX_ENABLE_REFUND && re_len &&
-        unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) {
-      /* Refund suitable pages into "unallocated" space */
-      txn_refund(txn);
-      re_list = txn->tw.reclaimed_pglist;
-      re_len = MDBX_PNL_GETSIZE(re_list);
-    }
+  eASSERT(env,
+          re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno);
+  if (MDBX_ENABLE_REFUND && re_len &&
+      unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) {
+    /* Refund suitable pages into "unallocated" space */
+    if (txn_refund(txn))
+      re_len = MDBX_PNL_GETSIZE(txn->tw.relist);
+  }
+  eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist));
+  eASSERT(env, pnl_check_allocated(txn->tw.relist,
+                                   txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 
-    /* Done for a kick-reclaim mode, actually no page needed */
-    if (unlikely(flags & MDBX_ALLOC_SLOT)) {
-      DEBUG("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT");
-#if MDBX_ENABLE_PGOP_STAT
-      eASSERT(env, timestamp != 0);
-      env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp;
-#endif /* MDBX_ENABLE_PGOP_STAT */
-      ret.err = MDBX_SUCCESS;
-      ret.page = NULL;
-      return ret;
-    }
-
-    /* Don't try to coalesce too much. 
*/ - if (re_len /* current size */ > coalesce_threshold) { - if (flags & MDBX_ALLOC_COALESCE) - TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - flags &= ~MDBX_ALLOC_COALESCE; - } + /* Done for a kick-reclaim mode, actually no page needed */ + if (unlikely(flags & MDBX_ALLOC_SLOT)) { + eASSERT(env, ret.err == MDBX_SUCCESS); + goto early_exit; } - if (F_ISSET(flags, MDBX_ALLOC_COALESCE | MDBX_ALLOC_GC)) { - DEBUG_EXTRA("clear %s and continue", "MDBX_ALLOC_COALESCE"); + /* TODO: delete reclaimed records */ + + /* Don't try to coalesce too much. */ + eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); + if (flags & MDBX_ALLOC_COALESCE) { + if (re_len /* current size */ < coalesce_threshold) { +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; +#endif /* MDBX_ENABLE_PROFGC */ + goto next_gc; + } + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); flags &= ~MDBX_ALLOC_COALESCE; - continue; } + scan: + eASSERT(env, should_scan); + if (re_len >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); + pgno = *range; + if (num == 1) + goto done; + range = scan4seq(range, re_len, num - 1); + eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); + if (likely(range)) { + pgno = *range; + goto done; + } + } + should_scan = false; + if (ret.err == MDBX_SUCCESS) + goto next_gc; + + depleted_gc: + ret.err = MDBX_NOTFOUND; + if (should_scan) + goto scan; + + //------------------------------------------------------------------------- + /* There is no suitable pages in the GC and to be able to allocate * we should CHOICE one of: * - make a new steady checkpoint if reclaiming was stopped by @@ -6855,203 +6907,241 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { * - extend the database file. */ /* Will use new pages from the map if nothing is suitable in the GC. */ - range = nullptr; pgno = txn->mt_next_pgno; - const size_t next = (size_t)pgno + num; + const size_t newnext = num + pgno; - if (flags & MDBX_ALLOC_GC) { - const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); - /* does reclaiming stopped at the last steady point? 
*/
-    if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady &&
-        detent == prefer_steady.txnid + 1) {
-      DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN
-            "-%s, detent %" PRIaTXN,
-            recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid,
-            durable_caption(prefer_steady.ptr_c), detent);
-      ret.err = MDBX_RESULT_TRUE;
-      const pgno_t autosync_threshold =
-          atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
-      const uint64_t autosync_period =
-          atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
-      uint64_t eoos_timestamp;
-      /* wipe the last steady-point if one of:
-       *  - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
-       *  - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
-       * otherwise, make a new steady-point if one of:
-       *  - auto-sync threshold is specified and reached;
-       *  - upper limit of database size is reached;
-       *  - database is full (with the current file size)
-       *    AND auto-sync threshold it NOT specified */
-      if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
-          ((autosync_threshold | autosync_period) == 0 ||
-           next >= prefer_steady.ptr_c->mm_geo.now)) {
-        /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
-         * without any auto-sync threshold(s). */
-        ret.err = wipe_steady(txn, detent);
-        DEBUG("gc-wipe-steady, rc %d", ret.err);
-        eASSERT(env, prefer_steady.ptr_c !=
-                         meta_prefer_steady(env, &txn->tw.troika).ptr_c);
-      } else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 ||
-                 (autosync_threshold &&
-                  atomic_load64(&env->me_lck->mti_unsynced_pages,
-                                mo_Relaxed) >= autosync_threshold) ||
-                 (autosync_period &&
-                  (eoos_timestamp = atomic_load64(
-                       &env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
-                  osal_monotime() - eoos_timestamp >= autosync_period) ||
-                 next >= txn->mt_geo.upper ||
-                 (next >= txn->mt_end_pgno &&
-                  (autosync_threshold | autosync_period) == 0)) {
-        /* make steady checkpoint. */
-        MDBX_meta meta = *recent.ptr_c;
-        ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta,
-                              &txn->tw.troika);
-        DEBUG("gc-make-steady, rc %d", ret.err);
-        eASSERT(env, prefer_steady.ptr_c !=
-                         meta_prefer_steady(env, &txn->tw.troika).ptr_c);
-      }
-      if (likely(ret.err != MDBX_RESULT_TRUE)) {
-        if (unlikely(ret.err != MDBX_SUCCESS))
-          goto fail;
-        continue;
-      }
-    }
-  }
+  const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
+  const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
+  /* did reclaiming stop at the last steady point? */
+  if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady &&
+      detent == prefer_steady.txnid + 1) {
+    DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN
+          "-%s, detent %" PRIaTXN,
+          recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid,
+          durable_caption(prefer_steady.ptr_c), detent);
+    const pgno_t autosync_threshold =
+        atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
+    const uint64_t autosync_period =
+        atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
+    uint64_t eoos_timestamp;
+    /* wipe the last steady-point if one of:
+     *  - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
+     *  - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
+     * otherwise, make a new steady-point if one of:
+     *  - auto-sync threshold is specified and reached;
+     *  - upper limit of database size is reached;
+     *  - database is full (with the current file size)
+     *    AND auto-sync threshold is NOT specified */
+    if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
+        ((autosync_threshold | autosync_period) == 0 ||
+         newnext >= prefer_steady.ptr_c->mm_geo.now)) {
+      /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
+       * without any auto-sync threshold(s). */
+#if MDBX_ENABLE_PROFGC
+      env->me_lck->mti_pgop_stat.gc_prof.wipes += 1;
+#endif /* MDBX_ENABLE_PROFGC */
+      ret.err = wipe_steady(txn, detent);
+      DEBUG("gc-wipe-steady, rc %d", ret.err);
+      if (unlikely(ret.err != MDBX_SUCCESS))
+        goto fail;
+      eASSERT(env, prefer_steady.ptr_c !=
+                       meta_prefer_steady(env, &txn->tw.troika).ptr_c);
+      goto retry_gc_refresh_oldest;
+    }
+    if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 ||
+        (autosync_threshold &&
+         atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
+             autosync_threshold) ||
+        (autosync_period &&
+         (eoos_timestamp =
+              atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
+         osal_monotime() - eoos_timestamp >= autosync_period) ||
+        newnext >= txn->mt_geo.upper ||
+        (newnext >= txn->mt_end_pgno &&
+         (autosync_threshold | autosync_period) == 0)) {
+      /* make steady checkpoint. */
+#if MDBX_ENABLE_PROFGC
+      env->me_lck->mti_pgop_stat.gc_prof.flushes += 1;
+#endif /* MDBX_ENABLE_PROFGC */
+      MDBX_meta meta = *recent.ptr_c;
+      ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta,
+                            &txn->tw.troika);
+      DEBUG("gc-make-steady, rc %d", ret.err);
+      eASSERT(env, ret.err != MDBX_RESULT_TRUE);
+      if (unlikely(ret.err != MDBX_SUCCESS))
+        goto fail;
+      eASSERT(env, prefer_steady.ptr_c !=
+                       meta_prefer_steady(env, &txn->tw.troika).ptr_c);
+      goto retry_gc_refresh_oldest;
+    }
+  }
 
-  /* don't kick lagging reader(s) if is enough unallocated space
+  if (env->me_lck_mmap.lck &&
+      unlikely(true ==
+               atomic_load32(&env->me_lck_mmap.lck->mti_readers_refresh_flag,
+                             mo_AcquireRelease))) {
+    oldest = txn_oldest_reader(txn);
+    if (oldest >= detent)
+      goto retry_gc_have_oldest;
+  }
+
+  /* avoid kicking lagging reader(s) if there is enough unallocated space
    * at the end of database file. 
*/ - if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno) + if ((flags & MDBX_ALLOC_NEW) && newnext <= txn->mt_end_pgno) { + eASSERT(env, range == nullptr); goto done; - - if (flags & MDBX_ALLOC_GC) { - const txnid_t laggard = txn_oldest_reader(txn); - if (laggard >= detent || (laggard < txn->mt_txnid - xMDBX_TXNID_STEP && - kick_longlived_readers(env, laggard) >= detent)) - continue; } - ret.err = MDBX_NOTFOUND; - if (flags & MDBX_ALLOC_NEW) { - ret.err = MDBX_MAP_FULL; - if (next < txn->mt_geo.upper && txn->mt_geo.grow_pv) { - eASSERT(env, next > txn->mt_end_pgno); - const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv); - size_t aligned = pgno_align2os_pgno( - env, (pgno_t)(next + grow_step - next % grow_step)); - - if (aligned > txn->mt_geo.upper) - aligned = txn->mt_geo.upper; - eASSERT(env, aligned > txn->mt_end_pgno); - - VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, - aligned - txn->mt_end_pgno); - ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, - txn->mt_geo.upper); - if (ret.err == MDBX_SUCCESS) { - env->me_txn->mt_end_pgno = (pgno_t)aligned; - goto done; - } - - ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, - aligned - txn->mt_end_pgno, ret.err); - } else { - NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, next, - txn->mt_geo.upper); - } + if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { + oldest = kick_longlived_readers(env, oldest); + if (oldest >= detent) + goto retry_gc_have_oldest; } - - fail: -#if MDBX_ENABLE_PGOP_STAT - if (timestamp) - env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; -#endif /* MDBX_ENABLE_PGOP_STAT */ - eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - int level; - const char *what; - if (likely(!(flags & MDBX_ALLOC_RESERVE))) { - txn->mt_flags |= MDBX_TXN_ERROR; - level = MDBX_LOG_ERROR; - what = "pages"; - } else { - level = (flags & MDBX_ALLOC_BACKLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; - what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; - } - if (LOG_ENABLED(level)) - debug_log(level, __func__, __LINE__, - "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); - - eASSERT(env, ret.err != MDBX_SUCCESS); - ret.page = NULL; - return ret; } + //--------------------------------------------------------------------------- + +no_gc: + if ((flags & MDBX_ALLOC_NEW) == 0) { + ret.err = MDBX_NOTFOUND; + goto fail; + } + + /* Will use new pages from the map if nothing is suitable in the GC. 
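+   * For illustration (hypothetical numbers): with next_pgno = 1000, num = 8
+   * and a grow step of 512 pages, newnext = 1008 is rounded up via
+   * newnext + grow_step - newnext % grow_step to 1024, then OS-page aligned
+   * and clamped to mt_geo.upper before map_resize_implicit() is called.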
*/ + pgno = txn->mt_next_pgno; + const size_t newnext = num + pgno; + if (newnext <= txn->mt_end_pgno) + goto done; + + if (newnext > txn->mt_geo.upper || !txn->mt_geo.grow_pv) { + NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, newnext, txn->mt_geo.upper); + ret.err = MDBX_MAP_FULL; + goto fail; + } + + eASSERT(env, newnext > txn->mt_end_pgno); + const size_t grow_step = pv2pages(txn->mt_geo.grow_pv); + size_t aligned = pgno_align2os_pgno( + env, (pgno_t)(newnext + grow_step - newnext % grow_step)); + + if (aligned > txn->mt_geo.upper) + aligned = txn->mt_geo.upper; + eASSERT(env, aligned >= newnext); + +#if MDBX_ENABLE_PROFGC + monotime_shot = osal_monotime(); +#endif /* MDBX_ENABLE_PROFGC */ + VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, + aligned - txn->mt_end_pgno); + ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned, + txn->mt_geo.upper); + if (ret.err != MDBX_SUCCESS) { + ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, + aligned - txn->mt_end_pgno, ret.err); + goto fail; + } + env->me_txn->mt_end_pgno = (pgno_t)aligned; + + //--------------------------------------------------------------------------- + done: - eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); - ENSURE(env, pgno >= NUM_METAS); -#if MDBX_ENABLE_PGOP_STAT - if (likely(timestamp)) - env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; -#endif /* MDBX_ENABLE_PGOP_STAT */ - if (unlikely(flags & MDBX_ALLOC_RESERVE)) { - DEBUG("return NULL for %u pages %s reservation", num, "gc-slot/backlog"); - ret.page = NULL; - ret.err = MDBX_SUCCESS; - return ret; - } - - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); - } else { - ret.page = page_malloc(txn, num); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto fail; - } - } - - if (range) { - cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); - tASSERT(txn, pgno < txn->mt_next_pgno); - tASSERT(txn, pgno == *range); - /* Cutoff allocated pages from tw.reclaimed_pglist */ + ret.err = MDBX_SUCCESS; + if (likely((flags & (MDBX_ALLOC_SLOT | MDBX_ALLOC_RESERVE)) == 0)) { + ENSURE(env, pgno >= NUM_METAS); + if (range) { + eASSERT(env, (txn->mt_flags & MDBX_TXN_FROZEN_RE) == 0); + eASSERT(env, pgno == *range); + eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); + /* Cutoff allocated pages from tw.relist */ #if MDBX_PNL_ASCENDING - for (const pgno_t *const end = re_list + re_len - num; range <= end; - ++range) - *range = range[num]; + for (const pgno_t *const end = re_list + re_len - num; range <= end; + ++range) + *range = range[num]; #else - for (const pgno_t *const end = re_list + re_len; ++range <= end;) - range[-(ptrdiff_t)num] = *range; + for (const pgno_t *const end = txn->tw.relist + re_len; ++range <= end;) + range[-(ptrdiff_t)num] = *range; #endif - MDBX_PNL_SETSIZE(re_list, re_len -= num); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + MDBX_PNL_SETSIZE(txn->tw.relist, re_len -= num); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + } else { + eASSERT(env, flags & MDBX_ALLOC_NEW); + pgno = txn->mt_next_pgno; + txn->mt_next_pgno += (pgno_t)num; + eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); + eASSERT(env, pgno >= NUM_METAS && pgno + num <= txn->mt_next_pgno); + } + +#if MDBX_ENABLE_PROFGC + if (!monotime_shot) + 
monotime_shot = osal_monotime(); +#endif /* MDBX_ENABLE_PROFGC */ + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + } else { + ret.page = page_malloc(txn, num); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + goto fail; + } + } + + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(ret.page, -1, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { + ret.page->mp_pages = (pgno_t)num; + ret.page->mp_flags = P_OVERFLOW; + } + + ret.err = page_dirty(txn, ret.page, (pgno_t)num); + if (unlikely(ret.err != MDBX_SUCCESS)) { + fail: + eASSERT(env, ret.err != MDBX_SUCCESS); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + int level; + const char *what; + if (flags & MDBX_ALLOC_RESERVE) { + level = (flags & MDBX_ALLOC_BACKLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; + what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; + } else { + txn->mt_flags |= MDBX_TXN_ERROR; + level = MDBX_LOG_ERROR; + what = "pages"; + } + if (LOG_ENABLED(level)) + debug_log(level, __func__, __LINE__, + "unable alloc %zu %s, flags 0x%x, errcode %d\n", num, what, + flags, ret.err); + ret.page = NULL; + } } else { - txn->mt_next_pgno = (pgno_t)(pgno + num); - eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); + early_exit: + DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, + (flags & MDBX_ALLOC_SLOT) ? "SLOT" : "RESERVE", ret.err); + ret.page = NULL; } - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(ret.page, -1, pgno2bytes(env, num)); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { - ret.page->mp_pages = (pgno_t)num; - ret.page->mp_flags = P_OVERFLOW; - } - ret.err = page_dirty(txn, ret.page, num); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; + const uint64_t monotime_now = osal_monotime(); + if (monotime_shot) { + prof->xtime_monotonic += monotime_shot - monotime_before; + prof->rtime_monotonic += monotime_now - monotime_shot; + } else + prof->rtime_monotonic += monotime_now - monotime_before; +#endif /* MDBX_ENABLE_PROFGC */ return ret; } @@ -7081,10 +7171,12 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { return ret; } - if (likely(!(mc->mc_flags & C_GCFREEZE))) { - MDBX_PNL pnl = txn->tw.reclaimed_pglist; + if (likely(!(txn->mt_flags & MDBX_TXN_FROZEN_RE))) { + MDBX_PNL pnl = txn->tw.relist; const size_t len = MDBX_PNL_GETSIZE(pnl); if (likely(len > 0)) { + MDBX_env *const env = txn->mt_env; + MDBX_PNL_SETSIZE(pnl, len - 1); #if MDBX_PNL_ASCENDING const pgno_t pgno = pnl[1]; @@ -7094,7 +7186,14 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { const pgno_t pgno = pnl[len]; #endif - MDBX_env *const env = txn->mt_env; +#if MDBX_ENABLE_PROFGC + const uint64_t monotime_before = osal_monotime(); + size_t 
majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + profgc_stat_t *const prof = + (mc->mc_dbi == FREE_DBI) ? &env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; +#endif /* MDBX_ENABLE_PROFGC */ pgr_t ret; if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); @@ -7103,7 +7202,7 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { ret.page = page_malloc(txn, 1); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; - return ret; + goto bailout; } } @@ -7114,8 +7213,15 @@ __hot static pgr_t page_alloc(MDBX_cursor *mc) { tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); ret.err = page_dirty(txn, ret.page, 1); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + bailout: + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->rtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; + prof->xtime_monotonic += osal_monotime() - monotime_before; +#endif /* MDBX_ENABLE_PROFGC */ return ret; } } @@ -8483,16 +8589,16 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_geo = parent->mt_geo; rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { - const size_t len = MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist) + - parent->tw.loose_count; - txn->tw.reclaimed_pglist = + const size_t len = + MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count; + txn->tw.relist = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.reclaimed_pglist)) + if (unlikely(!txn->tw.relist)) rc = MDBX_ENOMEM; } if (unlikely(rc != MDBX_SUCCESS)) { nested_failed: - pnl_free(txn->tw.reclaimed_pglist); + pnl_free(txn->tw.relist); dpl_free(txn); osal_free(txn); return rc; @@ -8505,7 +8611,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, const size_t di = dpl_exist(parent, lp->mp_pgno); tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp); tASSERT(parent, lp->mp_flags == P_LOOSE); - rc = pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); + rc = pnl_insert_range(&parent->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto nested_failed; parent->tw.loose_pages = lp->mp_next; @@ -8525,12 +8631,12 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.spill_pages) spill_purge(parent); - tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= - MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist)); - memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, - MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); + tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= + MDBX_PNL_GETSIZE(parent->tw.relist)); + memcpy(txn->tw.relist, parent->tw.relist, + MDBX_PNL_SIZEOF(parent->tw.relist)); eASSERT(env, pnl_check_allocated( - txn->tw.reclaimed_pglist, + txn->tw.relist, (txn->mt_next_pgno /* LY: intentional assignment here, only for assertion */ = parent->mt_next_pgno) - @@ -9028,7 +9134,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { /* Export or close DBI handles created in this txn */ dbi_update(txn, mode & MDBX_END_UPDATE); pnl_shrink(&txn->tw.retired_pages); - pnl_shrink(&txn->tw.reclaimed_pglist); + pnl_shrink(&txn->tw.relist); if (!(env->me_flags & MDBX_WRITEMAP)) dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. 
*/ @@ -9039,7 +9145,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); eASSERT(env, parent->mt_child == txn && (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, sizeof(meta_troika_t)) == 0); @@ -9067,7 +9173,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { tASSERT(parent, audit_ex(parent, 0, false) == 0); dlist_free(txn); dpl_free(txn); - pnl_free(txn->tw.reclaimed_pglist); + pnl_free(txn->tw.relist); if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { @@ -9163,7 +9269,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) { size_t pending = 0; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) + + pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) + (MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored); MDBX_cursor_couple cx; @@ -9270,7 +9376,7 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + " "%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)", txn->mt_txnid, pending, txn->tw.loose_count, - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), + MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.retired_pages ? MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, retired_stored); ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu" @@ -9302,7 +9408,7 @@ static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { } static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { - return MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; + return MDBX_PNL_GETSIZE(txn->tw.relist) + txn->tw.loose_count; } static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { @@ -9367,7 +9473,8 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, return err; } - ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + tASSERT(txn, txn->mt_flags & MDBX_TXN_UPDATE_GC); + txn->mt_flags -= MDBX_TXN_UPDATE_GC; err = cursor_touch(&ctx->cursor.outer); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); @@ -9393,7 +9500,7 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, MDBX_ALLOC_RESERVE | MDBX_ALLOC_BACKLOG) .err; - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } @@ -9422,11 +9529,11 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { MDBX_env *const env = txn->mt_env; const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; (void)dbg_prefix_mode; - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI]; txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer; - /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. + /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. * Page numbers cannot disappear from txn->tw.retired_pages[]. 
*/ @@ -9434,7 +9541,7 @@ retry: ++ctx->loop; TRACE("%s", " >> restart"); int rc = MDBX_SUCCESS; - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { @@ -9468,7 +9575,7 @@ retry: goto bailout; } - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (ctx->lifo) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed @@ -9508,7 +9615,7 @@ retry: } } else { /* If using records from GC which we have not yet deleted, - * now delete them and any we reserved for tw.reclaimed_pglist. */ + * now delete them and any we reserved for tw.relist. */ while (ctx->cleaned_id <= txn->tw.last_reclaimed) { rc = cursor_first(&ctx->cursor.outer, &key, NULL); if (rc == MDBX_NOTFOUND) @@ -9542,7 +9649,7 @@ retry: } } - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); if (AUDIT_ENABLED()) { @@ -9553,7 +9660,7 @@ retry: /* return suitable into unallocated space */ if (txn_refund(txn)) { - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (AUDIT_ENABLED()) { rc = audit_ex(txn, ctx->retired_stored, false); @@ -9564,7 +9671,7 @@ retry: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { - /* Return loose page numbers to tw.reclaimed_pglist, + /* Return loose page numbers to tw.relist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { @@ -9582,7 +9689,7 @@ retry: } /* Put loose page numbers in tw.retired_pages, - * since unable to return them to tw.reclaimed_pglist. */ + * since unable to return them to tw.relist. 
*/ if (unlikely((rc = pnl_need(&txn->tw.retired_pages, txn->tw.loose_count)) != 0)) goto bailout; @@ -9593,11 +9700,10 @@ retry: } } else { /* Room for loose pages + temp PNL with same */ - rc = pnl_need(&txn->tw.reclaimed_pglist, 2 * txn->tw.loose_count + 2); + rc = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - MDBX_PNL loose = txn->tw.reclaimed_pglist + - MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) - + MDBX_PNL loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - txn->tw.loose_count - 1; size_t count = 0; for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { @@ -9607,7 +9713,7 @@ retry: tASSERT(txn, count == txn->tw.loose_count); MDBX_PNL_SETSIZE(loose, count); pnl_sort(loose, txn->mt_next_pgno); - pnl_merge(txn->tw.reclaimed_pglist, loose); + pnl_merge(txn->tw.relist, loose); TRACE("%s: append %zu loose-pages to reclaimed-pages", dbg_prefix_mode, txn->tw.loose_count); } @@ -9652,15 +9758,15 @@ retry: #endif /* MDBX_ENABLE_REFUND */ } - const size_t amount = MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist); + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); /* handle retired-list - store ones into single gc-record */ if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ - ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + txn->mt_flags -= MDBX_TXN_UPDATE_GC; rc = page_search(&ctx->cursor.outer, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } @@ -9751,10 +9857,10 @@ retry: DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); DEBUG_EXTRA_PRINT("%s\n", "."); } - if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) && + if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.relist) && ctx->settled)) { TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix_mode, - amount, MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); + amount, MDBX_PNL_GETSIZE(txn->tw.relist)); goto retry /* rare case, but avoids GC fragmentation and one cycle. */ ; @@ -9763,7 +9869,7 @@ retry: } /* handle reclaimed and lost pages - merge and store both into gc */ - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, txn->tw.loose_count == 0); @@ -9800,7 +9906,7 @@ retry: bool need_cleanup = false; txnid_t snap_oldest; retry_rid: - ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; + txn->mt_flags -= MDBX_TXN_UPDATE_GC; do { snap_oldest = txn_oldest_reader(txn); rc = page_alloc_slowpath(&ctx->cursor.outer, 0, @@ -9818,7 +9924,7 @@ retry: left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page); - ctx->cursor.outer.mc_flags |= C_RECLAIMING; + txn->mt_flags += MDBX_TXN_UPDATE_GC; if (likely(rc == MDBX_SUCCESS)) { TRACE("%s: got enough from GC.", dbg_prefix_mode); @@ -9980,10 +10086,9 @@ retry: sizeof(pgno_t)) /* - 1 + span */; if (tail > avail) { for (size_t i = amount - span; i > 0; --i) { - if (MDBX_PNL_ASCENDING - ? (txn->tw.reclaimed_pglist[i] + span) - : (txn->tw.reclaimed_pglist[i] - span) == - txn->tw.reclaimed_pglist[i + span]) { + if (MDBX_PNL_ASCENDING ? 
(txn->tw.relist[i] + span) + : (txn->tw.relist[i] - span) == + txn->tw.relist[i + span]) { span += 1; avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - @@ -10031,7 +10136,7 @@ retry: gcu_prepare_backlog(txn, ctx, true); rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -10042,11 +10147,11 @@ retry: chunk); if (txn->tw.lifo_reclaimed && - unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)) && - (ctx->loop < 5 || MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist) - amount > - env->me_maxgc_ov1page)) { + unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.relist)) && + (ctx->loop < 5 || + MDBX_PNL_GETSIZE(txn->tw.relist) - amount > env->me_maxgc_ov1page)) { NOTICE("** restart: reclaimed-list growth %zu -> %zu", amount, - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto retry; } @@ -10065,15 +10170,15 @@ retry: ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot : ctx->reused_slot; rc = MDBX_SUCCESS; - tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); - if (MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)) { + if (MDBX_PNL_GETSIZE(txn->tw.relist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - const size_t amount = MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist); + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); size_t left = amount; if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); @@ -10087,7 +10192,7 @@ retry: while (true) { txnid_t fill_gc_id; TRACE("%s: left %zu of %zu", dbg_prefix_mode, left, - MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist)); + MDBX_PNL_GETSIZE(txn->tw.relist)); if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); @@ -10125,7 +10230,7 @@ retry: key.iov_len = sizeof(fill_gc_id); tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); - ctx->cursor.outer.mc_flags |= C_GCFREEZE; + txn->mt_flags += MDBX_TXN_FROZEN_RE; size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, @@ -10134,22 +10239,21 @@ retry: chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); if (ctx->loop < 7) - ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; + txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; } chunk = left; } rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_CURRENT | MDBX_RESERVE); - ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; + txn->mt_flags &= ~MDBX_TXN_FROZEN_RE; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; gcu_clean_reserved(env, data); if (unlikely(txn->tw.loose_count || - amount != MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist))) { + amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { NOTICE("** restart: reclaimed-list growth (%zu -> %zu, loose +%zu)", - amount, MDBX_PNL_GETSIZE(txn->tw.reclaimed_pglist), - txn->tw.loose_count); + amount, MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count); goto retry; } if (unlikely(txn->tw.lifo_reclaimed @@ -10170,12 +10274,12 @@ retry: pgno_t *dst = data.iov_base; *dst++ = (pgno_t)chunk; - pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; + 
pgno_t *src = MDBX_PNL_BEGIN(txn->tw.relist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; TRACE("%s: fill %zu [ %zu:%" PRIaPGNO "...%zu:%" PRIaPGNO "] @%" PRIaTXN, - dbg_prefix_mode, chunk, from - txn->tw.reclaimed_pglist, from[0], - to - txn->tw.reclaimed_pglist, to[-1], fill_gc_id); + dbg_prefix_mode, chunk, from - txn->tw.relist, from[0], + to - txn->tw.relist, to[-1], fill_gc_id); left -= chunk; if (AUDIT_ENABLED()) { @@ -10223,7 +10327,10 @@ retry: bailout: txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; - MDBX_PNL_SETSIZE(txn->tw.reclaimed_pglist, 0); + MDBX_PNL_SETSIZE(txn->tw.relist, 0); +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.wloops += ctx->loop; +#endif /* MDBX_ENABLE_PROFGC */ TRACE("<<< %zu loops, rc = %d", ctx->loop, rc); return rc; } @@ -10304,7 +10411,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } /* Remove reclaimed pages from parent's dirty list */ - const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; + const MDBX_PNL reclaimed_list = parent->tw.relist; dpl_sift(parent, reclaimed_list, false); /* Move retired pages from parent's dirty & spilled list to reclaimed */ @@ -10362,7 +10469,7 @@ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, kind, pgno); - int err = pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); + int err = pnl_insert_range(&parent->tw.relist, pgno, l); ENSURE(txn->mt_env, err == MDBX_SUCCESS); } MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); @@ -10636,8 +10743,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); const uint64_t ts_0 = latency ? osal_monotime() : 0; - uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; + MDBX_env *const env = txn->mt_env; int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) goto provide_latency; @@ -10647,7 +10755,6 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - MDBX_env *env = txn->mt_env; #if MDBX_ENV_CHECKPID if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; @@ -10714,7 +10821,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { const size_t retired_delta = MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; if (retired_delta) { - rc = pnl_need(&txn->tw.reclaimed_pglist, retired_delta); + rc = pnl_need(&txn->tw.relist, retired_delta); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -10745,9 +10852,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->tw.retired_pages = txn->tw.retired_pages; txn->tw.retired_pages = NULL; - pnl_free(parent->tw.reclaimed_pglist); - parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; - txn->tw.reclaimed_pglist = NULL; + pnl_free(parent->tw.relist); + parent->tw.relist = txn->tw.relist; + txn->tw.relist = NULL; parent->tw.last_reclaimed = txn->tw.last_reclaimed; parent->mt_geo = txn->mt_geo; @@ -10779,9 +10886,14 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->mt_dbistate[i] = state; } - ts_1 = latency ? 
osal_monotime() : 0; + if (latency) { + ts_1 = osal_monotime(); + ts_2 = /* no gc-update */ ts_1; + ts_3 = /* no audit */ ts_2; + ts_4 = /* no write */ ts_3; + ts_5 = /* no sync */ ts_4; + } txn_merge(parent, txn, parent_retired_len); - ts_2 = ts_3 = latency ? osal_monotime() : 0; env->me_txn = parent; parent->mt_child = NULL; tASSERT(parent, dirtylist_check(parent)); @@ -10794,13 +10906,12 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && lp->mp_pgno + 1 < parent->mt_next_pgno); /* Check parent's reclaimed pages not suitable for refund */ - if (MDBX_PNL_GETSIZE(parent->tw.reclaimed_pglist)) - tASSERT(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < - parent->mt_next_pgno); + if (MDBX_PNL_GETSIZE(parent->tw.relist)) + tASSERT(parent, + MDBX_PNL_MOST(parent->tw.relist) + 1 < parent->mt_next_pgno); } #endif /* MDBX_ENABLE_REFUND */ - ts_4 = ts_5 = latency ? osal_monotime() : 0; txn->mt_signature = 0; osal_free(txn); tASSERT(parent, audit_ex(parent, 0, false) == 0); @@ -10868,11 +10979,14 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } ts_1 = latency ? osal_monotime() : 0; + gcu_context_t gcu_ctx; + gc_cputime = latency ? osal_cputime(nullptr) : 0; rc = gcu_context_init(txn, &gcu_ctx); if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = update_gc(txn, &gcu_ctx); + gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0; if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -10884,7 +10998,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; - ts_2 = ts_3 = latency ? osal_monotime() : 0; + ts_2 = latency ? osal_monotime() : 0; + ts_3 = ts_2; if (AUDIT_ENABLED()) { rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); ts_3 = osal_monotime(); @@ -10967,11 +11082,48 @@ done: provide_latency: if (latency) { latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; - latency->gc = (ts_1 && ts_2) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->audit = - (ts_2 && AUDIT_ENABLED()) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->write = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - latency->sync = (ts_4 && ts_5) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + latency->gc_wallclock = + (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0; + latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + latency->sync = (ts_5 > ts_4) ? 
osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + +#if MDBX_ENABLE_PROFGC + pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; + latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; + latency->gc_prof.work_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); + latency->gc_prof.work_xtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_monotonic); + latency->gc_prof.work_rtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_cpu); + latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; + latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; + latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; + + latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; + latency->gc_prof.self_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); + latency->gc_prof.self_xtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_monotonic); + latency->gc_prof.self_rtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_cpu); + latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; + latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; + latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; + + latency->gc_prof.wloops = ptr->gc_prof.wloops; + latency->gc_prof.coalescences = ptr->gc_prof.coalescences; + latency->gc_prof.wipes = ptr->gc_prof.wipes; + latency->gc_prof.flushes = ptr->gc_prof.flushes; + latency->gc_prof.kicks = ptr->gc_prof.kicks; + if (txn == env->me_txn0) + memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); +#else + memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); +#endif /* MDBX_ENABLE_PROFGC */ + const uint64_t ts_6 = osal_monotime(); latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); @@ -11586,6 +11738,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.msync.weak += sync_op; +#else + (void)sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_msync(&env->me_dxb_mmap, 0, @@ -11593,6 +11747,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, } else { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.fsync.weak += sync_op; +#else + (void)sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ rc = osal_fsync(env->me_lazy_fd, mode_bits); } @@ -13966,8 +14122,8 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.reclaimed_pglist = pnl_alloc(MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) + txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) rc = MDBX_ENOMEM; } else rc = MDBX_ENOMEM; @@ -14098,7 +14254,7 @@ __cold static int env_close(MDBX_env *env) { txl_free(env->me_txn0->tw.lifo_reclaimed); pnl_free(env->me_txn0->tw.retired_pages); pnl_free(env->me_txn0->tw.spill_pages); - pnl_free(env->me_txn0->tw.reclaimed_pglist); + pnl_free(env->me_txn0->tw.relist); osal_free(env->me_txn0); env->me_txn0 = nullptr; } @@ -15944,7 +16100,7 @@ static int touch_dbi(MDBX_cursor *mc) { *mc->mc_dbistate |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { - cASSERT(mc, (mc->mc_flags & C_RECLAIMING) == 0); + cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_UPDATE_GC) == 0); /* Touch DB record 
of named DB */
    MDBX_cursor_couple cx;
    int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI);
@@ -16370,7 +16526,7 @@ __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
       /* Is the ov page from this txn (or a parent) and big enough? */
       int ovpages = lp.page->mp_pages;
       if (!IS_FROZEN(mc->mc_txn, lp.page) &&
-          (unlikely(mc->mc_flags & C_GCFREEZE)
+          (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_FROZEN_RE)
               ? (ovpages >= dpages)
               : (ovpages ==
                  /* LY: add configurable threshold to keep reserve space */
@@ -17428,9 +17584,8 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi,
   if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) {
     rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY);
     rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS;
-  } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) {
-    rc = setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db,
-                   txn->mt_env->me_psize);
+  } else if (unlikely(dbx->md_klen_max == 0)) {
+    rc = setup_dbx(dbx, db, txn->mt_env->me_psize);
   }

   if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) {
@@ -21114,8 +21269,6 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
         atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed);
     arg->mi_pgop_stat.fsync =
         atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed);
-    arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16(
-        atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed));
 #else
     memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat));
 #endif /* MDBX_ENABLE_PGOP_STAT*/
@@ -22064,8 +22217,12 @@ __cold static txnid_t kick_longlived_readers(MDBX_env *env,
         atomic_store64(&stucked->mr_tid, 0, mo_Relaxed);
         atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease);
       }
-    } else
+    } else if (!notify_eof_of_loop) {
+#if MDBX_ENABLE_PROFGC
+      env->me_lck->mti_pgop_stat.gc_prof.kicks += 1;
+#endif /* MDBX_ENABLE_PROFGC */
       notify_eof_of_loop = true;
+    }

   } while (++retry < INT_MAX);
diff --git a/src/internals.h b/src/internals.h
index 73a74a95..28282414 100644
--- a/src/internals.h
+++ b/src/internals.h
@@ -578,10 +578,30 @@ typedef struct MDBX_page {
 #pragma pack(pop)

-#if MDBX_ENABLE_PGOP_STAT
+typedef struct profgc_stat {
+  /* Monotonic wall-clock time
+   * spent reading and searching within the GC */
+  uint64_t rtime_monotonic;
+  /* Monotonic wall-clock time spent
+   * preparing pages fetched from the GC, including swap-in from disk. */
+  uint64_t xtime_monotonic;
+  /* User-mode CPU time
+   * spent reading and searching within the GC */
+  uint64_t rtime_cpu;
+  /* Number of read/search iterations within the GC while allocating pages */
+  uint32_t rsteps;
+  /* Number of requests to allocate sequences of pages,
+   * i.e. when the allocation of more than one page is requested */
+  uint32_t xpages;
+  /* Slow path execution count */
+  uint32_t spe_counter;
+  /* page faults (hard page faults) */
+  uint32_t majflt;
+} profgc_stat_t;
+
 /* Statistics of page operations overall of all (running, completed and aborted)
  * transactions */
-typedef struct {
+typedef struct pgop_stat {
   MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */
   MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */
   MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
@@ -592,15 +612,32 @@ typedef struct {
   MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
   MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */
-  MDBX_atomic_uint64_t
-      gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
-                  unit/scale is platform-depended, see osal_monotime(). */
   MDBX_atomic_uint64_t msync; /* Number of explicit msync/flush-to-disk operations */
   MDBX_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */
-} MDBX_pgop_stat_t;
-#endif /* MDBX_ENABLE_PGOP_STAT */
+
+  /* Statistics for GC profiling.
+   * Logically this data might be worth moving into a separate structure,
+   * but the difference would be purely cosmetic. */
+  struct {
+    /* Costs of maintaining user data */
+    profgc_stat_t work;
+    /* Costs of maintaining and updating the GC itself */
+    profgc_stat_t self;
+    /* GC update iterations,
+     * greater than 1 if there were retries/restarts */
+    uint32_t wloops;
+    /* Iterations of merging GC records */
+    uint32_t coalescences;
+    /* Erasures of steady commit points in MDBX_UTTERLY_NOSYNC mode */
+    uint32_t wipes;
+    /* Flushes of data to disk outside of MDBX_UTTERLY_NOSYNC mode */
+    uint32_t flushes;
+    /* Attempts to kick laggard readers */
+    uint32_t kicks;
+  } gc_prof;
+} pgop_stat_t;

 #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
 #define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
@@ -738,11 +775,9 @@ typedef struct MDBX_lockinfo {

   MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

-#if MDBX_ENABLE_PGOP_STAT
   /* Statistics of costly ops of all (running, completed and aborted)
    * transactions */
-  MDBX_pgop_stat_t mti_pgop_stat;
-#endif /* MDBX_ENABLE_PGOP_STAT*/
+  pgop_stat_t mti_pgop_stat;

   MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

@@ -962,9 +997,13 @@ struct MDBX_txn {
 /* Additional flag for sync_locked() */
 #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)

+#define MDBX_TXN_UPDATE_GC 0x20 /* GC is being updated */
+#define MDBX_TXN_FROZEN_RE 0x40 /* list of reclaimed-pgno must not be altered */
+
 #define TXN_FLAGS \
   (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \
-   MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)
+   MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_UPDATE_GC | \
+   MDBX_TXN_FROZEN_RE)

 #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \
     ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \
@@ -1023,8 +1062,8 @@ struct MDBX_txn {
     struct {
       meta_troika_t troika;
       /* In write txns, array of cursors for each DB */
-      pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
-      txnid_t last_reclaimed; /* ID of last used record */
+      pgno_t *relist; /* Reclaimed GC pages */
+      txnid_t last_reclaimed; /* ID of last used record */
 #if MDBX_ENABLE_REFUND
       pgno_t loose_refund_wl /* FIXME: describe */;
 #endif /* MDBX_ENABLE_REFUND */
@@ -1100,9 +1139,7 @@ struct MDBX_cursor {
 #define C_SUB 0x04 /* Cursor is a sub-cursor */
#define C_DEL 0x08 /* last op was a cursor_del */
 #define C_UNTRACK 0x10 /* Un-track cursor when closing */
-#define C_RECLAIMING 0x20 /* GC lookup is prohibited */
-#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */
-  uint8_t mc_flags; /* see mdbx_cursor */
+  uint8_t mc_flags;

 /* Cursor checking flags. */
 #define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */
@@ -1113,7 +1150,7 @@ struct MDBX_cursor {
 #define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
 #define CC_RETIRING 0x40 /* refs to child pages may be invalid */
 #define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
-  uint8_t mc_checking; /* page checking level */
+  uint8_t mc_checking;

   MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
   indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
diff --git a/src/options.h b/src/options.h
index 127a13be..11df6967 100644
--- a/src/options.h
+++ b/src/options.h
@@ -73,6 +73,13 @@
 #error MDBX_ENABLE_REFUND must be defined as 0 or 1
 #endif /* MDBX_ENABLE_REFUND */

+/** Controls profiling of GC search and updates. */
+#ifndef MDBX_ENABLE_PROFGC
+#define MDBX_ENABLE_PROFGC 0
+#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1)
+#error MDBX_ENABLE_PROFGC must be defined as 0 or 1
+#endif /* MDBX_ENABLE_PROFGC */
+
 /** Controls gathering statistics for page operations. */
 #ifndef MDBX_ENABLE_PGOP_STAT
 #define MDBX_ENABLE_PGOP_STAT 1
diff --git a/src/osal.c b/src/osal.c
index a29919fa..5d1339d3 100644
--- a/src/osal.c
+++ b/src/osal.c
@@ -18,6 +18,7 @@

 #if defined(_WIN32) || defined(_WIN64)

+#include <psapi.h>
 #include <winioctl.h>

 #if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG)
@@ -2700,6 +2701,60 @@ MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) {
   return 0;
 }

+MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults) {
+#if defined(_WIN32) || defined(_WIN64)
+  if (optional_page_faults) {
+    PROCESS_MEMORY_COUNTERS pmc;
+    *optional_page_faults =
+        GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc))
+            ?
pmc.PageFaultCount + : 0; + } + FILETIME unused, usermode; + if (GetThreadTimes(GetCurrentThread(), + /* CreationTime */ &unused, + /* ExitTime */ &unused, + /* KernelTime */ &unused, + /* UserTime */ &usermode)) { + /* one second = 10_000_000 * 100ns = 78125 * (1 << 7) * 100ns; + * result = (h * f / 10_000_000) << 32) + l * f / 10_000_000 = + * = ((h * f) >> 7) / 78125) << 32) + ((l * f) >> 7) / 78125; + * 1) {h, l} *= f; + * 2) {h, l} >>= 7; + * 3) result = ((h / 78125) << 32) + l / 78125; */ + uint64_t l = usermode.dwLowDateTime * performance_frequency.QuadPart; + uint64_t h = usermode.dwHighDateTime * performance_frequency.QuadPart; + l = h << (64 - 7) | l >> 7; + h = h >> 7; + return ((h / 78125) << 32) + l / 78125; + } +#elif defined(RUSAGE_THREAD) || defined(RUSAGE_LWP) +#ifndef RUSAGE_THREAD +#define RUSAGE_THREAD RUSAGE_LWP /* Solaris */ +#endif + struct rusage usage; + if (getrusage(RUSAGE_THREAD, &usage) == 0) { + if (optional_page_faults) + *optional_page_faults = usage.ru_majflt; + return usage.ru_utime.tv_sec * UINT64_C(1000000000) + + usage.ru_utime.tv_usec * 1000u; + } + if (optional_page_faults) + *optional_page_faults = 0; +#elif defined(CLOCK_THREAD_CPUTIME_ID) + if (optional_page_faults) + *optional_page_faults = 0; + struct timespec ts; + if (likely(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)) + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +#else + /* FIXME */ + if (optional_page_faults) + *optional_page_faults = 0; +#endif + return 0; +} + /*----------------------------------------------------------------------------*/ static void bootid_shake(bin128_t *p) { diff --git a/src/osal.h b/src/osal.h index d2603334..e8abacf3 100644 --- a/src/osal.h +++ b/src/osal.h @@ -620,6 +620,7 @@ osal_pthread_mutex_lock(pthread_mutex_t *mutex) { #endif /* !Windows */ MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults); MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);