From 3757eb72f7c6b46862f8f17881ac88e8cecc1979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 25 Nov 2022 18:04:43 +0300 Subject: [PATCH] =?UTF-8?q?mdbx:=20=D1=8D=D0=BA=D0=BE=D0=BD=D0=BE=D0=BC?= =?UTF-8?q?=D0=B8=D1=8F=20=D0=BF=D0=BE=D1=81=D0=BB=D0=B5=D0=B4=D0=BE=D0=B2?= =?UTF-8?q?=D0=B0=D1=82=D0=B5=D0=BB=D1=8C=D0=BD=D0=BE=D1=81=D1=82=D0=B5?= =?UTF-8?q?=D0=B9=20=D0=BF=D1=80=D0=B8=20=D0=B2=D1=8B=D0=B4=D0=B5=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D0=B8=20=D0=BE=D0=B4=D0=B8=D0=BD=D0=BE=D1=87?= =?UTF-8?q?=D0=BD=D1=8B=D1=85=20=D1=81=D1=82=D1=80=D0=B0=D0=BD=D0=B8=D1=86?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 215 +++++++++++++++++++++++++++++------------------- src/internals.h | 2 + 2 files changed, 134 insertions(+), 83 deletions(-) diff --git a/src/core.c b/src/core.c index 51184bf1..335da529 100644 --- a/src/core.c +++ b/src/core.c @@ -6516,7 +6516,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) #define scan4seq_default scan4seq_avx512bw -#define scan4seq scan4seq_default +#define scan4seq_impl scan4seq_default #elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) #define scan4seq_default scan4seq_avx2 #elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) @@ -6533,20 +6533,20 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, #define scan4seq_default scan4seq_fallback #endif /* scan4seq_default */ -#ifdef scan4seq -/* The scan4seq() is the best or no alternatives */ +#ifdef scan4seq_impl +/* The scan4seq_impl() is the best or no alternatives */ #elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS /* The scan4seq_default() will be used since no cpu-features detection support * from compiler. 
Please don't ask to implement cpuid-based detection and don't * make such PRs. */ -#define scan4seq scan4seq_default +#define scan4seq_impl scan4seq_default #else /* Selecting the most appropriate implementation at runtime, * depending on the available CPU features. */ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t seq); -static pgno_t *(*scan4seq)(pgno_t *range, const size_t len, - const size_t seq) = scan4seq_resolver; +static pgno_t *(*scan4seq_impl)(pgno_t *range, const size_t len, + const size_t seq) = scan4seq_resolver; static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, const size_t seq) { @@ -6569,10 +6569,10 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, choice = scan4seq_avx512bw; #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ /* Choosing of another variants should be added here. */ - scan4seq = choice ? choice : scan4seq_default; - return scan4seq(range, len, seq); + scan4seq_impl = choice ? choice : scan4seq_default; + return scan4seq_impl(range, len, seq); } -#endif /* scan4seq */ +#endif /* scan4seq_impl */ //------------------------------------------------------------------------------ @@ -6623,6 +6623,83 @@ __hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { return false; } +/* Removes one page number from the PNL and returns it. The item at MDBX_PNL_EDGE(pnl) is the default choice; when that item is immediately followed by its successor value (it opens a run of consecutive page numbers) and the list holds more than two items, the list is scanned for an item with no adjacent neighbour on either side and that item is taken instead, so that runs stay intact. The list size is decreased by one. The list must not be empty (asserted). */ __hot static pgno_t pnl_get_single(MDBX_PNL pnl) { + const size_t len = MDBX_PNL_GETSIZE(pnl); + assert(len > 0); + pgno_t *target = MDBX_PNL_EDGE(pnl); + const ptrdiff_t dir = MDBX_PNL_ASCENDING ?
1 : -1; /* step that leads from the edge item towards the opposite end of the list */ + + /* try to skip over sequences when there are single (isolated) items: the scan starts at the third item from the edge and runs only if the edge item and its neighbour are consecutive */ + if (likely(len > 2) && unlikely(target[dir] == *target + 1)) { + pgno_t *scan = target + dir + dir; + size_t left = len; + do { + if (likely(scan[-dir] != *scan - 1 && *scan + 1 != scan[dir])) { +#if MDBX_PNL_ASCENDING + target = scan; + break; +#else + /* cut the item out by shifting the tail (the items between it and the edge) one slot towards it */ + const pgno_t pgno = *scan; + MDBX_PNL_SETSIZE(pnl, len - 1); + while (++scan <= target) + scan[-1] = *scan; + return pgno; +#endif + } + scan += dir; + } while (--left > 2); /* NOTE(review): the loop makes len - 2 passes, so on the final pass scan[dir] addresses the slot just beyond the far end of the items (pnl[0] when descending, pnl[len + 1] when ascending) - confirm that this read is safe */ + } + + const pgno_t pgno = *target; +#if MDBX_PNL_ASCENDING + /* cut the item out by shifting the tail one slot down */ + MDBX_PNL_SETSIZE(pnl, len - 1); + for (const pgno_t *const end = pnl + len - 1; target <= end; ++target) + *target = target[1]; +#else + /* the edge item is the last one: no need to move the tail, just truncate the list */ + MDBX_PNL_SETSIZE(pnl, len - 1); +#endif + return pgno; +} + +/* Looks for a run of `num` (> 1) consecutive page numbers in the PNL. Returns 0 when no such run is found. When a run is found: if MDBX_ALLOC_RESERVE is set in `flags`, returns P_INVALID and leaves the list unmodified; otherwise cuts the `num` items out of the list and returns the page number read from the position where the run was found. Requires len >= num (asserted). */ __hot static pgno_t pnl_get_sequence(MDBX_PNL pnl, const size_t num, + uint8_t flags) { + const size_t len = MDBX_PNL_GETSIZE(pnl); + pgno_t *edge = MDBX_PNL_EDGE(pnl); + assert(len >= num && num > 1); + const size_t seq = num - 1; +#if !MDBX_PNL_ASCENDING + if (edge[-(ptrdiff_t)seq] - *edge == seq) { /* fast path: the last `num` items of the list already form the run */ + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + assert(edge == scan4range_checker(pnl, seq)); + /* no need to move the tail, just truncate the list */ + MDBX_PNL_SETSIZE(pnl, len - num); + return *edge; + } +#endif + pgno_t *target = scan4seq_impl(edge, len, seq); + assert(target == scan4range_checker(pnl, seq)); + if (target) { + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + const pgno_t pgno = *target; + /* cut the found sequence out by shifting the tail over it */ + MDBX_PNL_SETSIZE(pnl, len - num); +#if MDBX_PNL_ASCENDING + for (const pgno_t *const end = pnl + len - num; target <= end; ++target) + *target = target[num]; +#else + for (const pgno_t *const end = pnl + len; ++target <= end;) + target[-(ptrdiff_t)num]
= *target; +#endif + return pgno; + } + return 0; +} + static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, uint8_t flags) { #if MDBX_ENABLE_PROFGC @@ -6646,25 +6723,22 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno = 0, *range = nullptr; - size_t newnext, re_len = MDBX_PNL_GETSIZE(txn->tw.relist); + pgno_t pgno = 0; + size_t newnext; if (num > 1) { #if MDBX_ENABLE_PROFGC prof->xpages += 1; #endif /* MDBX_ENABLE_PROFGC */ - if (re_len >= num) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; + pgno = pnl_get_sequence(txn->tw.relist, num, flags); + if (likely(pgno)) goto done; - } } } else { - eASSERT(env, num == 0 || re_len == 0); + eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.relist) == 0); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE) || num == 0); } //--------------------------------------------------------------------------- @@ -6681,7 +6755,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, * большего резерва из-за увеличения списка переработанных страниц.
*/ (flags & MDBX_ALLOC_RESERVE) == 0) { if (txn->mt_dbs[FREE_DBI].md_branch_pages && - re_len < env->me_maxgc_ov1page / 2) + MDBX_PNL_GETSIZE(txn->tw.relist) < env->me_maxgc_ov1page / 2) flags += MDBX_ALLOC_COALESCE; } @@ -6775,40 +6849,38 @@ next_gc:; const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, - gc_len + re_len); + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); - if (unlikely(gc_len + re_len >= env->me_maxgc_ov1page)) { + if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= + env->me_maxgc_ov1page)) { /* Don't try to coalesce too much. */ if (flags & MDBX_ALLOC_SHOULD_SCAN) { eASSERT(env, flags & MDBX_ALLOC_COALESCE); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); eASSERT(env, num > 0); #if MDBX_ENABLE_PROFGC env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; #endif /* MDBX_ENABLE_PROFGC */ TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - if (re_len >= num) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 
1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; + if (likely(num == 1)) { + pgno = pnl_get_single(txn->tw.relist); goto done; } + pgno = pnl_get_sequence(txn->tw.relist, num, flags); + if (likely(pgno)) + goto done; } flags -= MDBX_ALLOC_COALESCE | MDBX_ALLOC_SHOULD_SCAN; } - if (unlikely(/* list is too long already */ re_len >= - env->me_options.rp_augment_limit) && + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( + txn->tw.relist) >= env->me_options.rp_augment_limit) && ((/* not a slot-request from gc-update */ num && /* have enough unallocated space */ txn->mt_geo.upper >= txn->mt_next_pgno + num) || - gc_len + re_len >= MDBX_PGL_LIMIT)) { + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { /* Stop reclaiming to avoid large/overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. 
@@ -6816,7 +6888,8 @@ next_gc:; */ NOTICE("stop reclaiming to avoid PNL overflow: %zu (current) + %zu " "(chunk) -> %zu", - re_len, gc_len, gc_len + re_len); + MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); goto depleted_gc; } } @@ -6844,7 +6917,7 @@ next_gc:; } /* Merge in descending sorted order */ - re_len = pnl_merge(txn->tw.relist, gc_pnl); + pnl_merge(txn->tw.relist, gc_pnl); flags |= MDBX_ALLOC_SHOULD_SCAN; if (AUDIT_ENABLED()) { if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { @@ -6856,15 +6929,13 @@ next_gc:; } eASSERT(env, dirtylist_check(txn)); - eASSERT(env, - re_len == 0 || MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); - if (MDBX_ENABLE_REFUND && re_len && + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); + if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.relist) && unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { /* Refund suitable pages into "unallocated" space */ txn_refund(txn); - re_len = MDBX_PNL_GETSIZE(txn->tw.relist); } - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); @@ -6872,7 +6943,7 @@ next_gc:; if (unlikely(num == 0)) { eASSERT(env, ret.err == MDBX_SUCCESS); TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, - re_len); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto early_exit; } @@ -6881,31 +6952,29 @@ next_gc:; eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); if (flags & MDBX_ALLOC_COALESCE) { TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, - re_len); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto next_gc; } scan: eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); eASSERT(env, num > 0); - if (re_len >= num) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && MDBX_PNL_FIRST(txn->tw.relist) < 
txn->mt_next_pgno); - range = txn->tw.relist + (MDBX_PNL_ASCENDING ? 1 : re_len); - pgno = *range; - if (num == 1) - goto done; - range = scan4seq(range, re_len, num - 1); - eASSERT(env, range == scan4range_checker(txn->tw.relist, num - 1)); - if (likely(range)) { - pgno = *range; + if (likely(num == 1)) { + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); + pgno = pnl_get_single(txn->tw.relist); goto done; } + pgno = pnl_get_sequence(txn->tw.relist, num, flags); + if (likely(pgno)) + goto done; } flags -= MDBX_ALLOC_SHOULD_SCAN; if (ret.err == MDBX_SUCCESS) { TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, - re_len); + MDBX_PNL_GETSIZE(txn->tw.relist)); goto next_gc; } @@ -6924,7 +6993,7 @@ depleted_gc: * - extend the database file. */ /* Will use new pages from the map if nothing is suitable in the GC. */ - newnext = (pgno = txn->mt_next_pgno) + num; + newnext = txn->mt_next_pgno + num; /* Does reclaiming stopped at the last steady point? */ const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); @@ -7001,7 +7070,7 @@ depleted_gc: /* Avoid kick lagging reader(s) if is enough unallocated space * at the end of database file. */ if (!(flags & MDBX_ALLOC_RESERVE) && newnext <= txn->mt_end_pgno) { - eASSERT(env, range == nullptr); + eASSERT(env, pgno == 0); goto done; } @@ -7014,13 +7083,14 @@ depleted_gc: //--------------------------------------------------------------------------- no_gc: + eASSERT(env, pgno == 0); if (flags & MDBX_ALLOC_RESERVE) { ret.err = MDBX_NOTFOUND; goto fail; } /* Will use new pages from the map if nothing is suitable in the GC. 
*/ - newnext = (pgno = txn->mt_next_pgno) + num; + newnext = txn->mt_next_pgno + num; if (newnext <= txn->mt_end_pgno) goto done; @@ -7052,27 +7122,15 @@ no_gc: goto fail; } env->me_txn->mt_end_pgno = (pgno_t)aligned; + eASSERT(env, pgno == 0); //--------------------------------------------------------------------------- done: ret.err = MDBX_SUCCESS; if (likely((flags & MDBX_ALLOC_RESERVE) == 0)) { - ENSURE(env, pgno >= NUM_METAS); - if (range) { - eASSERT(env, pgno == *range); + if (pgno) { eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); - eASSERT(env, re_len == MDBX_PNL_GETSIZE(txn->tw.relist)); - /* Cutoff allocated pages from tw.relist */ -#if MDBX_PNL_ASCENDING - for (const pgno_t *const end = re_list + re_len - num; range <= end; - ++range) - *range = range[num]; -#else - for (const pgno_t *const end = txn->tw.relist + re_len; ++range <= end;) - range[-(ptrdiff_t)num] = *range; -#endif - MDBX_PNL_SETSIZE(txn->tw.relist, re_len -= num); eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { @@ -7082,6 +7140,7 @@ done: eASSERT(env, pgno >= NUM_METAS && pgno + num <= txn->mt_next_pgno); } + ENSURE(env, pgno >= NUM_METAS); #if MDBX_ENABLE_PROFGC if (!monotime_shot) monotime_shot = osal_monotime(); @@ -7182,20 +7241,10 @@ __hot static pgr_t page_alloc(const MDBX_cursor *mc) { return ret; } - MDBX_PNL pnl = txn->tw.relist; - const size_t len = MDBX_PNL_GETSIZE(pnl); - if (likely(len > 0)) { + if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) { + const pgno_t pgno = pnl_get_single(txn->tw.relist); MDBX_env *const env = txn->mt_env; - MDBX_PNL_SETSIZE(pnl, len - 1); -#if MDBX_PNL_ASCENDING - const pgno_t pgno = pnl[1]; - for (size_t i = 1; i < len; ++i) - pnl[i] = pnl[i + 1]; -#else - const pgno_t pgno = pnl[len]; -#endif - #if MDBX_ENABLE_PROFGC const uint64_t monotime_before = osal_monotime(); size_t majflt_before; diff --git a/src/internals.h b/src/internals.h index e6bcdd9a..383581f8 100644 --- 
a/src/internals.h +++ b/src/internals.h @@ -949,9 +949,11 @@ typedef struct MDBX_dpl { #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) /* edge = address of item [1]; NOTE(review): presumably the item with the lowest page number, since LEAST maps to FIRST in this layout - confirm against MDBX_PNL_FIRST */ #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) /* edge = address of item [MDBX_PNL_GETSIZE(pl)], i.e. the slot just before MDBX_PNL_END(pl); NOTE(review): presumably again the lowest page number, since LEAST maps to LAST in this layout - confirm against MDBX_PNL_LAST */ #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif