mdbx: add me_madv_threshold, refactoring/simplification.

Reduces the overhead of MDBX_SHRINK_ALLOWED.
Леонид Юрьев (Leonid Yuriev) 2023-01-03 20:20:03 +03:00
parent 24f2e878c1
commit c6b73c8a24
2 changed files with 130 additions and 124 deletions
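
The change below derives env->me_madv_threshold once in adjust_defaults() whenever the geometry changes, instead of recomputing a byte threshold via madvise_threshold() on every sync_locked() call. As a minimal standalone sketch of that derivation (the 4 KiB DB/OS page sizes here are assumptions for illustration; mdbx takes the real values from the environment):

#include <stdio.h>

#define MEGABYTE ((size_t)1 << 20)

/* Mirrors the derivation added to adjust_defaults(): roughly basis/512,
 * clamped to the [64 KiB, 4 MiB] range, capped by the shrink step, then
 * converted to pages. The 4096-byte page/alignment values are assumed
 * for illustration only. */
static size_t derive_madv_threshold_pages(size_t basis, size_t shrink) {
  const unsigned factor = 9;
  size_t threshold = (basis < (65536ul << factor))
                         ? 65536 /* minimal threshold */
                         : (basis > (MEGABYTE * 4 << factor))
                               ? MEGABYTE * 4 /* maximal threshold */
                               : basis >> factor;
  threshold = (threshold < shrink || !shrink) ? threshold : shrink;
  const size_t os_psize = 4096, db_psize = 4096; /* assumed page sizes */
  const size_t aligned = (threshold + os_psize - 1) & ~(os_psize - 1);
  return aligned / db_psize; /* stands in for bytes2pgno() */
}

int main(void) {
  printf("%zu\n", derive_madv_threshold_pages((size_t)1 << 30, 0)); /* 512 */
  printf("%zu\n", derive_madv_threshold_pages(8 * MEGABYTE, 0));    /* 16 */
  return 0;
}

For example, a 1 GiB mapping yields a 2 MiB (512-page) threshold (1 GiB >> 9), while anything under 32 MiB clamps to the 64 KiB minimum (16 pages).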


@@ -5651,8 +5651,8 @@ static txnid_t txn_oldest_reader(const MDBX_txn *const txn) {
 }
 
 /* Find largest mvcc-snapshot still referenced. */
-__cold static pgno_t find_largest_snapshot(const MDBX_env *env,
-                                           pgno_t last_used_page) {
+static pgno_t find_largest_snapshot(const MDBX_env *env,
+                                    pgno_t last_used_page) {
   MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
   if (likely(lck != NULL /* check for exclusive without-lck mode */)) {
   retry:;
@@ -5981,7 +5981,7 @@ __cold static unsigned default_rp_augment_limit(const MDBX_env *env) {
                      (augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL));
 }
 
-__cold static bool default_prefault_write(const MDBX_env *env) {
+static bool default_prefault_write(const MDBX_env *env) {
   return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore &&
          (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP;
 }
@@ -5991,6 +5991,21 @@ static void adjust_defaults(MDBX_env *env) {
     env->me_options.rp_augment_limit = default_rp_augment_limit(env);
   if (!env->me_options.flags.non_auto.prefault_write)
     env->me_options.prefault_write = default_prefault_write(env);
+
+  const size_t basis = env->me_dbgeo.now;
+  /* TODO: use options? */
+  const unsigned factor = 9;
+  size_t threshold = (basis < (65536ul << factor))
+                         ? 65536 /* minimal threshold */
+                         : (basis > (MEGABYTE * 4 << factor))
+                               ? MEGABYTE * 4 /* maximal threshold */
+                               : basis >> factor;
+  threshold = (threshold < env->me_dbgeo.shrink || !env->me_dbgeo.shrink)
+                  ? threshold
+                  : env->me_dbgeo.shrink;
+
+  env->me_madv_threshold =
+      bytes2pgno(env, bytes_align2os_bytes(env, threshold));
 }
 
 __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
@@ -12435,20 +12450,6 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) {
   return page_meta(page2);
 }
 
-#if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64))
-static size_t madvise_threshold(const MDBX_env *env,
-                                const size_t largest_bytes) {
-  /* TODO: use options */
-  const unsigned factor = 9;
-  const size_t threshold = (largest_bytes < (65536ul << factor))
-                               ? 65536 /* minimal threshold */
-                               : (largest_bytes > (MEGABYTE * 4 << factor))
-                                     ? MEGABYTE * 4 /* maximal threshold */
-                                     : largest_bytes >> factor;
-  return bytes_align2os_bytes(env, threshold);
-}
-#endif /* MDBX_ENABLE_MADVISE */
-
 static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
                        meta_troika_t *const troika) {
   eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
@@ -12482,127 +12483,131 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
   pgno_t shrink = 0;
   if (flags & MDBX_SHRINK_ALLOWED) {
-    /* LY: check conditions to discard unused pages */
-    const pgno_t largest_pgno = find_largest_snapshot(
-        env, (head.ptr_c->mm_geo.next > pending->mm_geo.next)
-                 ? head.ptr_c->mm_geo.next
-                 : pending->mm_geo.next);
-    eASSERT(env, largest_pgno >= NUM_METAS);
+    const size_t prev_discarded_pgno =
+        atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
+    if (prev_discarded_pgno < pending->mm_geo.next)
+      env->me_lck->mti_discarded_tail.weak = pending->mm_geo.next;
+    else if (prev_discarded_pgno >=
+             pending->mm_geo.next + env->me_madv_threshold) {
+      /* LY: check conditions to discard unused pages */
+      const pgno_t largest_pgno = find_largest_snapshot(
+          env, (head.ptr_c->mm_geo.next > pending->mm_geo.next)
+                   ? head.ptr_c->mm_geo.next
+                   : pending->mm_geo.next);
+      eASSERT(env, largest_pgno >= NUM_METAS);
 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
-    const pgno_t edge = env->me_poison_edge;
-    if (edge > largest_pgno) {
-      env->me_poison_edge = largest_pgno;
-      VALGRIND_MAKE_MEM_NOACCESS(
-          ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
-          pgno2bytes(env, edge - largest_pgno));
-      MDBX_ASAN_POISON_MEMORY_REGION(
-          ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
-          pgno2bytes(env, edge - largest_pgno));
-    }
+      const pgno_t edge = env->me_poison_edge;
+      if (edge > largest_pgno) {
+        env->me_poison_edge = largest_pgno;
+        VALGRIND_MAKE_MEM_NOACCESS(
+            ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
+            pgno2bytes(env, edge - largest_pgno));
+        MDBX_ASAN_POISON_MEMORY_REGION(
+            ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
+            pgno2bytes(env, edge - largest_pgno));
+      }
 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
 #if MDBX_ENABLE_MADVISE && \
     (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED))
-    const size_t largest_bytes = pgno2bytes(env, largest_pgno);
-    /* threshold to avoid unreasonable frequent madvise() calls */
-    const size_t threshold = madvise_threshold(env, largest_bytes);
-    const size_t discard_edge_bytes = bytes_align2os_bytes(
-        env, ((MDBX_RDONLY &
-               (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak
-                                     : env->me_flags))
-                  ? largest_bytes
-                  : largest_bytes + threshold));
-    const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes);
-    const pgno_t prev_discarded_pgno =
-        atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
-    if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) {
-      NOTICE("shrink-MADV_%s %u..%u", "DONTNEED", largest_pgno,
-             prev_discarded_pgno);
-      atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno,
-                     mo_Relaxed);
-      const size_t prev_discarded_bytes =
-          ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize);
-      ENSURE(env, prev_discarded_bytes > discard_edge_bytes);
-      munlock_after(env, discard_edge_pgno,
-                    bytes_align2os_bytes(env, env->me_dxb_mmap.current));
-      const uint32_t munlocks_before =
-          atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed);
+      const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno);
+      if (prev_discarded_pgno >= discard_edge_pgno + env->me_madv_threshold) {
+        const size_t prev_discarded_bytes =
+            pgno_align2os_bytes(env, prev_discarded_pgno);
+        const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno);
+        /* due to the alignment, prev_discarded_bytes and discard_edge_bytes
+         * may be equal */
+        if (prev_discarded_bytes > discard_edge_bytes) {
+          NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno,
+                 prev_discarded_pgno);
+          munlock_after(env, discard_edge_pgno,
+                        bytes_align2os_bytes(env, env->me_dxb_mmap.current));
+          const uint32_t munlocks_before =
+              atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed);
 #if defined(MADV_DONTNEED)
-      int advise = MADV_DONTNEED;
+          int advise = MADV_DONTNEED;
 #if defined(MADV_FREE) && \
     0 /* MADV_FREE works for only anonymous vma at the moment */
-      if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000)
-        advise = MADV_FREE;
+          if ((env->me_flags & MDBX_WRITEMAP) &&
+              linux_kernel_version > 0x04050000)
+            advise = MADV_FREE;
 #endif /* MADV_FREE */
-      int err = madvise(ptr_disp(env->me_map, discard_edge_bytes),
-                        prev_discarded_bytes - discard_edge_bytes, advise)
-                    ? ignore_enosys(errno)
-                    : MDBX_SUCCESS;
+          int err = madvise(ptr_disp(env->me_map, discard_edge_bytes),
+                            prev_discarded_bytes - discard_edge_bytes, advise)
+                        ? ignore_enosys(errno)
+                        : MDBX_SUCCESS;
 #else
-      int err = ignore_enosys(posix_madvise(
-          ptr_disp(env->me_map, discard_edge_bytes),
-          prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
+          int err = ignore_enosys(posix_madvise(
+              ptr_disp(env->me_map, discard_edge_bytes),
+              prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
 #endif
-      if (unlikely(MDBX_IS_ERROR(err))) {
-        const uint32_t mlocks_after =
-            atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed);
-        if (err == MDBX_EINVAL) {
-          const int severity = (mlocks_after - munlocks_before)
-                                   ? MDBX_LOG_NOTICE
-                                   : MDBX_LOG_WARN;
-          if (LOG_ENABLED(severity))
-            debug_log(severity, __func__, __LINE__,
-                      "%s-madvise: ignore EINVAL (%d) since some pages maybe "
-                      "locked (%u/%u mlcnt-processes)",
-                      "shrink", err, mlocks_after, munlocks_before);
-        } else {
-          ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d",
-                "shrink", "DONTNEED", discard_edge_bytes,
-                prev_discarded_bytes - discard_edge_bytes, mlocks_after,
-                munlocks_before, err);
-          return err;
-        }
-      } else
-        env->me_lck->mti_discarded_tail.weak = discard_edge_pgno;
-    }
+          if (unlikely(MDBX_IS_ERROR(err))) {
+            const uint32_t mlocks_after =
+                atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed);
+            if (err == MDBX_EINVAL) {
+              const int severity = (mlocks_after - munlocks_before)
+                                       ? MDBX_LOG_NOTICE
+                                       : MDBX_LOG_WARN;
+              if (LOG_ENABLED(severity))
+                debug_log(
+                    severity, __func__, __LINE__,
+                    "%s-madvise: ignore EINVAL (%d) since some pages maybe "
+                    "locked (%u/%u mlcnt-processes)",
+                    "shrink", err, mlocks_after, munlocks_before);
+            } else {
+              ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d",
+                    "shrink", "DONTNEED", discard_edge_bytes,
+                    prev_discarded_bytes - discard_edge_bytes, mlocks_after,
+                    munlocks_before, err);
+              return err;
+            }
+          } else
+            env->me_lck->mti_discarded_tail.weak = discard_edge_pgno;
+        }
+      }
 #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */
-    /* LY: check conditions to shrink datafile */
-    const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3;
-    pgno_t shrink_step = 0;
-    if (pending->mm_geo.shrink_pv &&
-        pending->mm_geo.now - pending->mm_geo.next >
-            (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) {
-      if (pending->mm_geo.now > largest_pgno &&
-          pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
-        const pgno_t aligner =
-            pending->mm_geo.grow_pv
-                ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv)
-                : shrink_step;
-        const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
-        const pgno_t aligned = pgno_align2os_pgno(
-            env, with_backlog_gap + aligner - with_backlog_gap % aligner);
-        const pgno_t bottom =
-            (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
-        if (pending->mm_geo.now > bottom) {
-          if (TROIKA_HAVE_STEADY(troika))
-            /* force steady, but only if steady-checkpoint is present */
-            flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED;
-          shrink = pending->mm_geo.now - bottom;
-          pending->mm_geo.now = bottom;
-          if (unlikely(head.txnid == pending->unsafe_txnid)) {
-            const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid);
-            NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN,
-                   pending->unsafe_txnid, txnid);
-            ENSURE(env, !env->me_txn0 ||
-                            (env->me_txn0->mt_owner != osal_thread_self() &&
-                             !env->me_txn));
-            if (unlikely(txnid > MAX_TXNID)) {
-              rc = MDBX_TXN_FULL;
-              ERROR("txnid overflow, raise %d", rc);
-              goto fail;
-            }
-            meta_set_txnid(env, pending, txnid);
-            eASSERT(env, coherency_check_meta(env, pending, true));
-          }
-        }
-      }
-    }
+      /* LY: check conditions to shrink datafile */
+      const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3;
+      pgno_t shrink_step = 0;
+      if (pending->mm_geo.shrink_pv &&
+          pending->mm_geo.now - pending->mm_geo.next >
+              (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) +
+                  backlog_gap) {
+        if (pending->mm_geo.now > largest_pgno &&
+            pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
+          const pgno_t aligner =
+              pending->mm_geo.grow_pv
+                  ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv)
+                  : shrink_step;
+          const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
+          const pgno_t aligned = pgno_align2os_pgno(
+              env, with_backlog_gap + aligner - with_backlog_gap % aligner);
+          const pgno_t bottom = (aligned > pending->mm_geo.lower)
+                                    ? aligned
+                                    : pending->mm_geo.lower;
+          if (pending->mm_geo.now > bottom) {
+            if (TROIKA_HAVE_STEADY(troika))
+              /* force steady, but only if steady-checkpoint is present */
+              flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED;
+            shrink = pending->mm_geo.now - bottom;
+            pending->mm_geo.now = bottom;
+            if (unlikely(head.txnid == pending->unsafe_txnid)) {
+              const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid);
+              NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN,
+                     pending->unsafe_txnid, txnid);
+              ENSURE(env, !env->me_txn0 ||
+                              (env->me_txn0->mt_owner != osal_thread_self() &&
+                               !env->me_txn));
+              if (unlikely(txnid > MAX_TXNID)) {
+                rc = MDBX_TXN_FULL;
+                ERROR("txnid overflow, raise %d", rc);
+                goto fail;
+              }
+              meta_set_txnid(env, pending, txnid);
+              eASSERT(env, coherency_check_meta(env, pending, true));
+            }
+          }
+        }
+      }
+    }
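
With this reordering, a commit under MDBX_SHRINK_ALLOWED can bail out on two cheap page-counter comparisons before scanning the reader table (find_largest_snapshot()) or issuing munlock/madvise syscalls. A simplified sketch of the new gating order, using stand-in types and stubs rather than mdbx's actual API, and omitting the OS-page alignment of the discard edge:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t pgno_t;

/* Hypothetical stand-in for find_largest_snapshot(): pretend a lagging
 * reader still pins an mvcc-snapshot covering pages up to 1000. */
static pgno_t find_largest_snapshot_stub(pgno_t next) {
  return (next > 1000) ? next : 1000;
}

/* Sketch of the reordered discard gating inside sync_locked(). */
static void discard_tail(pgno_t *discarded_tail, pgno_t next,
                         pgno_t madv_threshold) {
  if (*discarded_tail < next) {
    /* the data grew past the discarded tail: just advance the marker,
     * with no reader-table scan and no syscall */
    *discarded_tail = next;
  } else if (*discarded_tail >= next + madv_threshold) {
    /* at least madv_threshold pages might be released, so the
     * expensive path is justified */
    const pgno_t largest = find_largest_snapshot_stub(next);
    if (*discarded_tail >= largest + madv_threshold) {
      printf("madvise(MADV_DONTNEED) pages %" PRIu32 "..%" PRIu32 "\n",
             largest, *discarded_tail);
      *discarded_tail = largest;
    }
  } /* else: below the threshold, skip all the work */
}

int main(void) {
  pgno_t tail = 2000;
  discard_tail(&tail, 500, 512); /* 2000 >= 500+512: scan, then discard */
  discard_tail(&tail, 900, 512); /* 1000 < 900+512: cheap skip */
  return 0;
}

Only the last branch pays for the reader-table scan and MADV_DONTNEED, and only when at least me_madv_threshold pages stand to be released.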


@@ -1398,6 +1398,7 @@ struct MDBX_env {
   uint32_t me_live_reader;        /* have liveness lock in reader table */
   void *me_userctx;               /* User-settable context */
   MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+  size_t me_madv_threshold;
   struct {
     unsigned dp_reserve_limit;