mdbx: добавление me_madv_threshold и рефакторинг/упрощение.

Для уменьшения затрат на MDBX_SHRINK_ALLOWED.
This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2023-01-03 20:20:03 +03:00
parent 24f2e878c1
commit c6b73c8a24
2 changed files with 130 additions and 124 deletions

View File

@ -5651,7 +5651,7 @@ static txnid_t txn_oldest_reader(const MDBX_txn *const txn) {
} }
/* Find largest mvcc-snapshot still referenced. */ /* Find largest mvcc-snapshot still referenced. */
__cold static pgno_t find_largest_snapshot(const MDBX_env *env, static pgno_t find_largest_snapshot(const MDBX_env *env,
pgno_t last_used_page) { pgno_t last_used_page) {
MDBX_lockinfo *const lck = env->me_lck_mmap.lck; MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
if (likely(lck != NULL /* check for exclusive without-lck mode */)) { if (likely(lck != NULL /* check for exclusive without-lck mode */)) {
@ -5981,7 +5981,7 @@ __cold static unsigned default_rp_augment_limit(const MDBX_env *env) {
(augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL)); (augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL));
} }
__cold static bool default_prefault_write(const MDBX_env *env) { static bool default_prefault_write(const MDBX_env *env) {
return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore && return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore &&
(env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP; (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP;
} }
@ -5991,6 +5991,21 @@ static void adjust_defaults(MDBX_env *env) {
env->me_options.rp_augment_limit = default_rp_augment_limit(env); env->me_options.rp_augment_limit = default_rp_augment_limit(env);
if (!env->me_options.flags.non_auto.prefault_write) if (!env->me_options.flags.non_auto.prefault_write)
env->me_options.prefault_write = default_prefault_write(env); env->me_options.prefault_write = default_prefault_write(env);
const size_t basis = env->me_dbgeo.now;
/* TODO: use options? */
const unsigned factor = 9;
size_t threshold = (basis < (65536ul << factor))
? 65536 /* minimal threshold */
: (basis > (MEGABYTE * 4 << factor))
? MEGABYTE * 4 /* maximal threshold */
: basis >> factor;
threshold = (threshold < env->me_dbgeo.shrink || !env->me_dbgeo.shrink)
? threshold
: env->me_dbgeo.shrink;
env->me_madv_threshold =
bytes2pgno(env, bytes_align2os_bytes(env, threshold));
} }
__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
@ -12435,20 +12450,6 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) {
return page_meta(page2); return page_meta(page2);
} }
#if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64))
/* Computes the step used to avoid unreasonably frequent madvise() calls.
 *
 * The step is `largest_bytes` shifted right by 9 bits (i.e. 1/512 of it),
 * clamped to the range 65536 .. MEGABYTE * 4 bytes. The clamped value is then
 * handed to bytes_align2os_bytes(), whose result is returned as-is
 * (judging by its name it aligns to the OS page size; see its definition).
 *
 * env            environment, only forwarded to bytes_align2os_bytes().
 * largest_bytes  size in bytes from which the step is derived. */
static size_t madvise_threshold(const MDBX_env *env,
                                const size_t largest_bytes) {
  /* TODO: use options */
  const unsigned shift = 9;
  size_t clamped;
  if (largest_bytes < (65536ul << shift))
    clamped = 65536; /* minimal threshold */
  else if (largest_bytes > (MEGABYTE * 4 << shift))
    clamped = MEGABYTE * 4; /* maximal threshold */
  else
    clamped = largest_bytes >> shift;
  return bytes_align2os_bytes(env, clamped);
}
#endif /* MDBX_ENABLE_MADVISE */
static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
meta_troika_t *const troika) { meta_troika_t *const troika) {
eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
@ -12482,12 +12483,19 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
pgno_t shrink = 0; pgno_t shrink = 0;
if (flags & MDBX_SHRINK_ALLOWED) { if (flags & MDBX_SHRINK_ALLOWED) {
const size_t prev_discarded_pgno =
atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
if (prev_discarded_pgno < pending->mm_geo.next)
env->me_lck->mti_discarded_tail.weak = pending->mm_geo.next;
else if (prev_discarded_pgno >=
pending->mm_geo.next + env->me_madv_threshold) {
/* LY: check conditions to discard unused pages */ /* LY: check conditions to discard unused pages */
const pgno_t largest_pgno = find_largest_snapshot( const pgno_t largest_pgno = find_largest_snapshot(
env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) env, (head.ptr_c->mm_geo.next > pending->mm_geo.next)
? head.ptr_c->mm_geo.next ? head.ptr_c->mm_geo.next
: pending->mm_geo.next); : pending->mm_geo.next);
eASSERT(env, largest_pgno >= NUM_METAS); eASSERT(env, largest_pgno >= NUM_METAS);
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
const pgno_t edge = env->me_poison_edge; const pgno_t edge = env->me_poison_edge;
if (edge > largest_pgno) { if (edge > largest_pgno) {
@ -12500,28 +12508,19 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
pgno2bytes(env, edge - largest_pgno)); pgno2bytes(env, edge - largest_pgno));
} }
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
#if MDBX_ENABLE_MADVISE && \ #if MDBX_ENABLE_MADVISE && \
(defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED))
const size_t largest_bytes = pgno2bytes(env, largest_pgno); const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno);
/* threshold to avoid unreasonable frequent madvise() calls */ if (prev_discarded_pgno >= discard_edge_pgno + env->me_madv_threshold) {
const size_t threshold = madvise_threshold(env, largest_bytes);
const size_t discard_edge_bytes = bytes_align2os_bytes(
env, ((MDBX_RDONLY &
(env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak
: env->me_flags))
? largest_bytes
: largest_bytes + threshold));
const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes);
const pgno_t prev_discarded_pgno =
atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) {
NOTICE("shrink-MADV_%s %u..%u", "DONTNEED", largest_pgno,
prev_discarded_pgno);
atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno,
mo_Relaxed);
const size_t prev_discarded_bytes = const size_t prev_discarded_bytes =
ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); pgno_align2os_bytes(env, prev_discarded_pgno);
ENSURE(env, prev_discarded_bytes > discard_edge_bytes); const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno);
/* из-за выравнивания prev_discarded_bytes и discard_edge_bytes
* могут быть равны */
if (prev_discarded_bytes > discard_edge_bytes) {
NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno,
prev_discarded_pgno);
munlock_after(env, discard_edge_pgno, munlock_after(env, discard_edge_pgno,
bytes_align2os_bytes(env, env->me_dxb_mmap.current)); bytes_align2os_bytes(env, env->me_dxb_mmap.current));
const uint32_t munlocks_before = const uint32_t munlocks_before =
@ -12530,7 +12529,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
int advise = MADV_DONTNEED; int advise = MADV_DONTNEED;
#if defined(MADV_FREE) && \ #if defined(MADV_FREE) && \
0 /* MADV_FREE works for only anonymous vma at the moment */ 0 /* MADV_FREE works for only anonymous vma at the moment */
if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000) if ((env->me_flags & MDBX_WRITEMAP) &&
linux_kernel_version > 0x04050000)
advise = MADV_FREE; advise = MADV_FREE;
#endif /* MADV_FREE */ #endif /* MADV_FREE */
int err = madvise(ptr_disp(env->me_map, discard_edge_bytes), int err = madvise(ptr_disp(env->me_map, discard_edge_bytes),
@ -12550,7 +12550,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
? MDBX_LOG_NOTICE ? MDBX_LOG_NOTICE
: MDBX_LOG_WARN; : MDBX_LOG_WARN;
if (LOG_ENABLED(severity)) if (LOG_ENABLED(severity))
debug_log(severity, __func__, __LINE__, debug_log(
severity, __func__, __LINE__,
"%s-madvise: ignore EINVAL (%d) since some pages maybe " "%s-madvise: ignore EINVAL (%d) since some pages maybe "
"locked (%u/%u mlcnt-processes)", "locked (%u/%u mlcnt-processes)",
"shrink", err, mlocks_after, munlocks_before); "shrink", err, mlocks_after, munlocks_before);
@ -12564,6 +12565,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
} else } else
env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; env->me_lck->mti_discarded_tail.weak = discard_edge_pgno;
} }
}
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */
/* LY: check conditions to shrink datafile */ /* LY: check conditions to shrink datafile */
@ -12571,7 +12573,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
pgno_t shrink_step = 0; pgno_t shrink_step = 0;
if (pending->mm_geo.shrink_pv && if (pending->mm_geo.shrink_pv &&
pending->mm_geo.now - pending->mm_geo.next > pending->mm_geo.now - pending->mm_geo.next >
(shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) { (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) +
backlog_gap) {
if (pending->mm_geo.now > largest_pgno && if (pending->mm_geo.now > largest_pgno &&
pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
const pgno_t aligner = const pgno_t aligner =
@ -12581,8 +12584,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
const pgno_t with_backlog_gap = largest_pgno + backlog_gap; const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
const pgno_t aligned = pgno_align2os_pgno( const pgno_t aligned = pgno_align2os_pgno(
env, with_backlog_gap + aligner - with_backlog_gap % aligner); env, with_backlog_gap + aligner - with_backlog_gap % aligner);
const pgno_t bottom = const pgno_t bottom = (aligned > pending->mm_geo.lower)
(aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; ? aligned
: pending->mm_geo.lower;
if (pending->mm_geo.now > bottom) { if (pending->mm_geo.now > bottom) {
if (TROIKA_HAVE_STEADY(troika)) if (TROIKA_HAVE_STEADY(troika))
/* force steady, but only if steady-checkpoint is present */ /* force steady, but only if steady-checkpoint is present */
@ -12608,6 +12612,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
} }
} }
} }
}
/* LY: step#1 - sync previously written/updated data-pages */ /* LY: step#1 - sync previously written/updated data-pages */
rc = MDBX_RESULT_FALSE /* carry steady */; rc = MDBX_RESULT_FALSE /* carry steady */;

View File

@ -1398,6 +1398,7 @@ struct MDBX_env {
uint32_t me_live_reader; /* have liveness lock in reader table */ uint32_t me_live_reader; /* have liveness lock in reader table */
void *me_userctx; /* User-settable context */ void *me_userctx; /* User-settable context */
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
size_t me_madv_threshold;
struct { struct {
unsigned dp_reserve_limit; unsigned dp_reserve_limit;