mirror of
https://github.com/isar/libmdbx.git
synced 2025-02-28 15:58:15 +08:00
mdbx: добавление me_madv_threshold
и рефакторинг/упрощение.
Для уменьшения затрат на MDBX_SHRINK_ALLOWED.
This commit is contained in:
parent
24f2e878c1
commit
c6b73c8a24
253
src/core.c
253
src/core.c
@ -5651,8 +5651,8 @@ static txnid_t txn_oldest_reader(const MDBX_txn *const txn) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Find largest mvcc-snapshot still referenced. */
|
/* Find largest mvcc-snapshot still referenced. */
|
||||||
__cold static pgno_t find_largest_snapshot(const MDBX_env *env,
|
static pgno_t find_largest_snapshot(const MDBX_env *env,
|
||||||
pgno_t last_used_page) {
|
pgno_t last_used_page) {
|
||||||
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
|
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
|
||||||
if (likely(lck != NULL /* check for exclusive without-lck mode */)) {
|
if (likely(lck != NULL /* check for exclusive without-lck mode */)) {
|
||||||
retry:;
|
retry:;
|
||||||
@ -5981,7 +5981,7 @@ __cold static unsigned default_rp_augment_limit(const MDBX_env *env) {
|
|||||||
(augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL));
|
(augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL));
|
||||||
}
|
}
|
||||||
|
|
||||||
__cold static bool default_prefault_write(const MDBX_env *env) {
|
static bool default_prefault_write(const MDBX_env *env) {
|
||||||
return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore &&
|
return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore &&
|
||||||
(env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP;
|
(env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP;
|
||||||
}
|
}
|
||||||
@ -5991,6 +5991,21 @@ static void adjust_defaults(MDBX_env *env) {
|
|||||||
env->me_options.rp_augment_limit = default_rp_augment_limit(env);
|
env->me_options.rp_augment_limit = default_rp_augment_limit(env);
|
||||||
if (!env->me_options.flags.non_auto.prefault_write)
|
if (!env->me_options.flags.non_auto.prefault_write)
|
||||||
env->me_options.prefault_write = default_prefault_write(env);
|
env->me_options.prefault_write = default_prefault_write(env);
|
||||||
|
|
||||||
|
const size_t basis = env->me_dbgeo.now;
|
||||||
|
/* TODO: use options? */
|
||||||
|
const unsigned factor = 9;
|
||||||
|
size_t threshold = (basis < (65536ul << factor))
|
||||||
|
? 65536 /* minimal threshold */
|
||||||
|
: (basis > (MEGABYTE * 4 << factor))
|
||||||
|
? MEGABYTE * 4 /* maximal threshold */
|
||||||
|
: basis >> factor;
|
||||||
|
threshold = (threshold < env->me_dbgeo.shrink || !env->me_dbgeo.shrink)
|
||||||
|
? threshold
|
||||||
|
: env->me_dbgeo.shrink;
|
||||||
|
|
||||||
|
env->me_madv_threshold =
|
||||||
|
bytes2pgno(env, bytes_align2os_bytes(env, threshold));
|
||||||
}
|
}
|
||||||
|
|
||||||
__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
|
__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
|
||||||
@ -12435,20 +12450,6 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) {
|
|||||||
return page_meta(page2);
|
return page_meta(page2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64))
|
|
||||||
static size_t madvise_threshold(const MDBX_env *env,
|
|
||||||
const size_t largest_bytes) {
|
|
||||||
/* TODO: use options */
|
|
||||||
const unsigned factor = 9;
|
|
||||||
const size_t threshold = (largest_bytes < (65536ul << factor))
|
|
||||||
? 65536 /* minimal threshold */
|
|
||||||
: (largest_bytes > (MEGABYTE * 4 << factor))
|
|
||||||
? MEGABYTE * 4 /* maximal threshold */
|
|
||||||
: largest_bytes >> factor;
|
|
||||||
return bytes_align2os_bytes(env, threshold);
|
|
||||||
}
|
|
||||||
#endif /* MDBX_ENABLE_MADVISE */
|
|
||||||
|
|
||||||
static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
||||||
meta_troika_t *const troika) {
|
meta_troika_t *const troika) {
|
||||||
eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
|
eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
|
||||||
@ -12482,127 +12483,131 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
|||||||
|
|
||||||
pgno_t shrink = 0;
|
pgno_t shrink = 0;
|
||||||
if (flags & MDBX_SHRINK_ALLOWED) {
|
if (flags & MDBX_SHRINK_ALLOWED) {
|
||||||
/* LY: check conditions to discard unused pages */
|
const size_t prev_discarded_pgno =
|
||||||
const pgno_t largest_pgno = find_largest_snapshot(
|
atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
|
||||||
env, (head.ptr_c->mm_geo.next > pending->mm_geo.next)
|
if (prev_discarded_pgno < pending->mm_geo.next)
|
||||||
? head.ptr_c->mm_geo.next
|
env->me_lck->mti_discarded_tail.weak = pending->mm_geo.next;
|
||||||
: pending->mm_geo.next);
|
else if (prev_discarded_pgno >=
|
||||||
eASSERT(env, largest_pgno >= NUM_METAS);
|
pending->mm_geo.next + env->me_madv_threshold) {
|
||||||
|
/* LY: check conditions to discard unused pages */
|
||||||
|
const pgno_t largest_pgno = find_largest_snapshot(
|
||||||
|
env, (head.ptr_c->mm_geo.next > pending->mm_geo.next)
|
||||||
|
? head.ptr_c->mm_geo.next
|
||||||
|
: pending->mm_geo.next);
|
||||||
|
eASSERT(env, largest_pgno >= NUM_METAS);
|
||||||
|
|
||||||
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
|
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
|
||||||
const pgno_t edge = env->me_poison_edge;
|
const pgno_t edge = env->me_poison_edge;
|
||||||
if (edge > largest_pgno) {
|
if (edge > largest_pgno) {
|
||||||
env->me_poison_edge = largest_pgno;
|
env->me_poison_edge = largest_pgno;
|
||||||
VALGRIND_MAKE_MEM_NOACCESS(
|
VALGRIND_MAKE_MEM_NOACCESS(
|
||||||
ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
|
ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
|
||||||
pgno2bytes(env, edge - largest_pgno));
|
pgno2bytes(env, edge - largest_pgno));
|
||||||
MDBX_ASAN_POISON_MEMORY_REGION(
|
MDBX_ASAN_POISON_MEMORY_REGION(
|
||||||
ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
|
ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)),
|
||||||
pgno2bytes(env, edge - largest_pgno));
|
pgno2bytes(env, edge - largest_pgno));
|
||||||
}
|
}
|
||||||
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
|
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
|
||||||
|
|
||||||
#if MDBX_ENABLE_MADVISE && \
|
#if MDBX_ENABLE_MADVISE && \
|
||||||
(defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED))
|
(defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED))
|
||||||
const size_t largest_bytes = pgno2bytes(env, largest_pgno);
|
const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno);
|
||||||
/* threshold to avoid unreasonable frequent madvise() calls */
|
if (prev_discarded_pgno >= discard_edge_pgno + env->me_madv_threshold) {
|
||||||
const size_t threshold = madvise_threshold(env, largest_bytes);
|
const size_t prev_discarded_bytes =
|
||||||
const size_t discard_edge_bytes = bytes_align2os_bytes(
|
pgno_align2os_bytes(env, prev_discarded_pgno);
|
||||||
env, ((MDBX_RDONLY &
|
const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno);
|
||||||
(env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak
|
/* из-за выравнивания prev_discarded_bytes и discard_edge_bytes
|
||||||
: env->me_flags))
|
* могут быть равны */
|
||||||
? largest_bytes
|
if (prev_discarded_bytes > discard_edge_bytes) {
|
||||||
: largest_bytes + threshold));
|
NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno,
|
||||||
const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes);
|
prev_discarded_pgno);
|
||||||
const pgno_t prev_discarded_pgno =
|
munlock_after(env, discard_edge_pgno,
|
||||||
atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
|
bytes_align2os_bytes(env, env->me_dxb_mmap.current));
|
||||||
if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) {
|
const uint32_t munlocks_before =
|
||||||
NOTICE("shrink-MADV_%s %u..%u", "DONTNEED", largest_pgno,
|
atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed);
|
||||||
prev_discarded_pgno);
|
|
||||||
atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno,
|
|
||||||
mo_Relaxed);
|
|
||||||
const size_t prev_discarded_bytes =
|
|
||||||
ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize);
|
|
||||||
ENSURE(env, prev_discarded_bytes > discard_edge_bytes);
|
|
||||||
munlock_after(env, discard_edge_pgno,
|
|
||||||
bytes_align2os_bytes(env, env->me_dxb_mmap.current));
|
|
||||||
const uint32_t munlocks_before =
|
|
||||||
atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed);
|
|
||||||
#if defined(MADV_DONTNEED)
|
#if defined(MADV_DONTNEED)
|
||||||
int advise = MADV_DONTNEED;
|
int advise = MADV_DONTNEED;
|
||||||
#if defined(MADV_FREE) && \
|
#if defined(MADV_FREE) && \
|
||||||
0 /* MADV_FREE works for only anonymous vma at the moment */
|
0 /* MADV_FREE works for only anonymous vma at the moment */
|
||||||
if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000)
|
if ((env->me_flags & MDBX_WRITEMAP) &&
|
||||||
advise = MADV_FREE;
|
linux_kernel_version > 0x04050000)
|
||||||
|
advise = MADV_FREE;
|
||||||
#endif /* MADV_FREE */
|
#endif /* MADV_FREE */
|
||||||
int err = madvise(ptr_disp(env->me_map, discard_edge_bytes),
|
int err = madvise(ptr_disp(env->me_map, discard_edge_bytes),
|
||||||
prev_discarded_bytes - discard_edge_bytes, advise)
|
prev_discarded_bytes - discard_edge_bytes, advise)
|
||||||
? ignore_enosys(errno)
|
? ignore_enosys(errno)
|
||||||
: MDBX_SUCCESS;
|
: MDBX_SUCCESS;
|
||||||
#else
|
#else
|
||||||
int err = ignore_enosys(posix_madvise(
|
int err = ignore_enosys(posix_madvise(
|
||||||
ptr_disp(env->me_map, discard_edge_bytes),
|
ptr_disp(env->me_map, discard_edge_bytes),
|
||||||
prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
|
prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
|
||||||
#endif
|
#endif
|
||||||
if (unlikely(MDBX_IS_ERROR(err))) {
|
if (unlikely(MDBX_IS_ERROR(err))) {
|
||||||
const uint32_t mlocks_after =
|
const uint32_t mlocks_after =
|
||||||
atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed);
|
atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed);
|
||||||
if (err == MDBX_EINVAL) {
|
if (err == MDBX_EINVAL) {
|
||||||
const int severity = (mlocks_after - munlocks_before)
|
const int severity = (mlocks_after - munlocks_before)
|
||||||
? MDBX_LOG_NOTICE
|
? MDBX_LOG_NOTICE
|
||||||
: MDBX_LOG_WARN;
|
: MDBX_LOG_WARN;
|
||||||
if (LOG_ENABLED(severity))
|
if (LOG_ENABLED(severity))
|
||||||
debug_log(severity, __func__, __LINE__,
|
debug_log(
|
||||||
"%s-madvise: ignore EINVAL (%d) since some pages maybe "
|
severity, __func__, __LINE__,
|
||||||
"locked (%u/%u mlcnt-processes)",
|
"%s-madvise: ignore EINVAL (%d) since some pages maybe "
|
||||||
"shrink", err, mlocks_after, munlocks_before);
|
"locked (%u/%u mlcnt-processes)",
|
||||||
} else {
|
"shrink", err, mlocks_after, munlocks_before);
|
||||||
ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d",
|
} else {
|
||||||
"shrink", "DONTNEED", discard_edge_bytes,
|
ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d",
|
||||||
prev_discarded_bytes - discard_edge_bytes, mlocks_after,
|
"shrink", "DONTNEED", discard_edge_bytes,
|
||||||
munlocks_before, err);
|
prev_discarded_bytes - discard_edge_bytes, mlocks_after,
|
||||||
return err;
|
munlocks_before, err);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
env->me_lck->mti_discarded_tail.weak = discard_edge_pgno;
|
||||||
}
|
}
|
||||||
} else
|
}
|
||||||
env->me_lck->mti_discarded_tail.weak = discard_edge_pgno;
|
|
||||||
}
|
|
||||||
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */
|
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */
|
||||||
|
|
||||||
/* LY: check conditions to shrink datafile */
|
/* LY: check conditions to shrink datafile */
|
||||||
const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3;
|
const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3;
|
||||||
pgno_t shrink_step = 0;
|
pgno_t shrink_step = 0;
|
||||||
if (pending->mm_geo.shrink_pv &&
|
if (pending->mm_geo.shrink_pv &&
|
||||||
pending->mm_geo.now - pending->mm_geo.next >
|
pending->mm_geo.now - pending->mm_geo.next >
|
||||||
(shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) {
|
(shrink_step = pv2pages(pending->mm_geo.shrink_pv)) +
|
||||||
if (pending->mm_geo.now > largest_pgno &&
|
backlog_gap) {
|
||||||
pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
|
if (pending->mm_geo.now > largest_pgno &&
|
||||||
const pgno_t aligner =
|
pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
|
||||||
pending->mm_geo.grow_pv
|
const pgno_t aligner =
|
||||||
? /* grow_step */ pv2pages(pending->mm_geo.grow_pv)
|
pending->mm_geo.grow_pv
|
||||||
: shrink_step;
|
? /* grow_step */ pv2pages(pending->mm_geo.grow_pv)
|
||||||
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
|
: shrink_step;
|
||||||
const pgno_t aligned = pgno_align2os_pgno(
|
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
|
||||||
env, with_backlog_gap + aligner - with_backlog_gap % aligner);
|
const pgno_t aligned = pgno_align2os_pgno(
|
||||||
const pgno_t bottom =
|
env, with_backlog_gap + aligner - with_backlog_gap % aligner);
|
||||||
(aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
|
const pgno_t bottom = (aligned > pending->mm_geo.lower)
|
||||||
if (pending->mm_geo.now > bottom) {
|
? aligned
|
||||||
if (TROIKA_HAVE_STEADY(troika))
|
: pending->mm_geo.lower;
|
||||||
/* force steady, but only if steady-checkpoint is present */
|
if (pending->mm_geo.now > bottom) {
|
||||||
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED;
|
if (TROIKA_HAVE_STEADY(troika))
|
||||||
shrink = pending->mm_geo.now - bottom;
|
/* force steady, but only if steady-checkpoint is present */
|
||||||
pending->mm_geo.now = bottom;
|
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED;
|
||||||
if (unlikely(head.txnid == pending->unsafe_txnid)) {
|
shrink = pending->mm_geo.now - bottom;
|
||||||
const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid);
|
pending->mm_geo.now = bottom;
|
||||||
NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN,
|
if (unlikely(head.txnid == pending->unsafe_txnid)) {
|
||||||
pending->unsafe_txnid, txnid);
|
const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid);
|
||||||
ENSURE(env, !env->me_txn0 ||
|
NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN,
|
||||||
(env->me_txn0->mt_owner != osal_thread_self() &&
|
pending->unsafe_txnid, txnid);
|
||||||
!env->me_txn));
|
ENSURE(env, !env->me_txn0 ||
|
||||||
if (unlikely(txnid > MAX_TXNID)) {
|
(env->me_txn0->mt_owner != osal_thread_self() &&
|
||||||
rc = MDBX_TXN_FULL;
|
!env->me_txn));
|
||||||
ERROR("txnid overflow, raise %d", rc);
|
if (unlikely(txnid > MAX_TXNID)) {
|
||||||
goto fail;
|
rc = MDBX_TXN_FULL;
|
||||||
|
ERROR("txnid overflow, raise %d", rc);
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
meta_set_txnid(env, pending, txnid);
|
||||||
|
eASSERT(env, coherency_check_meta(env, pending, true));
|
||||||
}
|
}
|
||||||
meta_set_txnid(env, pending, txnid);
|
|
||||||
eASSERT(env, coherency_check_meta(env, pending, true));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1398,6 +1398,7 @@ struct MDBX_env {
|
|||||||
uint32_t me_live_reader; /* have liveness lock in reader table */
|
uint32_t me_live_reader; /* have liveness lock in reader table */
|
||||||
void *me_userctx; /* User-settable context */
|
void *me_userctx; /* User-settable context */
|
||||||
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
|
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
|
||||||
|
size_t me_madv_threshold;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
unsigned dp_reserve_limit;
|
unsigned dp_reserve_limit;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user