mdbx: rework (NO)READAHEAD handling.

Resolves https://github.com/erthink/libmdbx/issues/164

---

NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel,
because MADV_WILLNEED with offset != 0 may cause SIGBUS
on following access to the hinted region.

19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021; root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64

Change-Id: I11ebbf2bd35e3dba9d078be16cb5678aecf8329c
This commit is contained in:
Leonid Yuriev 2021-04-17 00:13:51 +03:00
parent 28affe79d8
commit 3e0fad1cf6
3 changed files with 130 additions and 66 deletions

View File

@ -1896,6 +1896,7 @@ xkeep
XLB
xmerge
xml
xnu
XOPEN
xp
XSI

View File

@ -5434,36 +5434,86 @@ static __always_inline __maybe_unused int ignore_enosys(int err) {
#if MDBX_ENABLE_MADVISE
/* Turn on/off readahead. It's harmful when the DB is larger than RAM. */
static __cold int mdbx_set_readahead(MDBX_env *env, const size_t offset,
const size_t length, const bool enable) {
assert(length > 0);
static __cold int mdbx_set_readahead(MDBX_env *env, const pgno_t edge,
const bool enable,
const bool force_whole) {
mdbx_assert(env, edge >= NUM_METAS && edge <= MAX_PAGENO);
mdbx_assert(env, (enable & 1) == (enable != 0));
const bool toggle = force_whole ||
((enable ^ *env->me_readahead_anchor) & 1) ||
!*env->me_readahead_anchor;
const pgno_t prev_edge = *env->me_readahead_anchor >> 1;
const size_t limit = env->me_dxb_mmap.limit;
size_t offset =
toggle ? 0
: pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge);
offset = (offset < limit) ? offset : limit;
size_t length =
pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge);
length = (length < limit) ? length : limit;
length -= offset;
mdbx_assert(env, 0 <= (intptr_t)length);
if (length == 0)
return MDBX_SUCCESS;
int err;
mdbx_notice("readahead %s %u..%u", enable ? "ON" : "OFF",
bytes2pgno(env, offset), bytes2pgno(env, offset + length));
#if defined(F_RDAHEAD)
if (unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1))
return errno;
if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) {
err = errno;
goto bailout;
}
#endif /* F_RDAHEAD */
if (enable) {
#if defined(MADV_NORMAL)
err = madvise(env->me_map + offset, length, MADV_NORMAL)
? ignore_enosys(errno)
: MDBX_SUCCESS;
if (unlikely(MDBX_IS_ERROR(err)))
goto bailout;
#elif defined(POSIX_MADV_NORMAL)
err = ignore_enosys(
posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL));
if (unlikely(MDBX_IS_ERROR(err)))
goto bailout;
#elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED)
err = ignore_enosys(
posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_NORMAL));
if (unlikely(MDBX_IS_ERROR(err)))
goto bailout;
#elif defined(_WIN32) || defined(_WIN64)
/* no madvise on Windows */
#else
#warning "FIXME"
#endif
if (toggle) {
/* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel,
* because MADV_WILLNEED with offset != 0 may cause SIGBUS
* on following access to the hinted region.
* 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021;
* root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */
#if defined(F_RDADVISE)
struct radvisory hint;
hint.ra_offset = offset;
hint.ra_count = length;
(void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl(
env->me_lazy_fd, F_RDADVISE, &hint);
#endif /* F_RDADVISE */
#if defined(MADV_WILLNEED)
int err = madvise(env->me_map + offset, length, MADV_WILLNEED)
#elif defined(MADV_WILLNEED)
err = madvise(env->me_map + offset, length, MADV_WILLNEED)
? ignore_enosys(errno)
: MDBX_SUCCESS;
if (unlikely(MDBX_IS_ERROR(err)))
return err;
goto bailout;
#elif defined(POSIX_MADV_WILLNEED)
int err = ignore_enosys(
err = ignore_enosys(
posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
goto bailout;
#elif defined(_WIN32) || defined(_WIN64)
if (mdbx_PrefetchVirtualMemory) {
WIN32_MEMORY_RANGE_ENTRY hint;
@ -5472,31 +5522,42 @@ static __cold int mdbx_set_readahead(MDBX_env *env, const size_t offset,
(void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0);
}
#elif defined(POSIX_FADV_WILLNEED)
int err = ignore_enosys(
err = ignore_enosys(
posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
#endif /* MADV_WILLNEED */
goto bailout;
#else
#warning "FIXME"
#endif
}
} else {
#if defined(MADV_RANDOM)
int err = madvise(env->me_map + offset, length, MADV_RANDOM)
err = madvise(env->me_map + offset, length, MADV_RANDOM)
? ignore_enosys(errno)
: MDBX_SUCCESS;
if (unlikely(MDBX_IS_ERROR(err)))
return err;
goto bailout;
#elif defined(POSIX_MADV_RANDOM)
int err = ignore_enosys(
err = ignore_enosys(
posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
goto bailout;
#elif defined(POSIX_FADV_RANDOM)
int err = ignore_enosys(
err = ignore_enosys(
posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
goto bailout;
#elif defined(_WIN32) || defined(_WIN64)
/* no madvise on Windows */
#else
#warning "FIXME"
#endif /* MADV_RANDOM */
}
return MDBX_SUCCESS;
*env->me_readahead_anchor = (enable & 1) + (edge << 1);
err = MDBX_SUCCESS;
bailout:
return err;
}
#endif /* MDBX_ENABLE_MADVISE */
@ -5636,28 +5697,18 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
mapping_can_be_moved);
#if MDBX_ENABLE_MADVISE
if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) {
const int readahead =
if (rc == MDBX_SUCCESS) {
env->me_discarded_tail->weak = size_pgno;
const bool readahead =
!(env->me_flags & MDBX_NORDAHEAD) &&
mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size);
if (readahead == MDBX_RESULT_FALSE)
rc = mdbx_set_readahead(
env, 0, (size_bytes > prev_size) ? size_bytes : prev_size, false);
else if (readahead == MDBX_RESULT_TRUE) {
const size_t readahead_pivot =
(limit_bytes != prev_limit || env->me_dxb_mmap.address != prev_addr
const bool force = limit_bytes != prev_limit ||
env->me_dxb_mmap.address != prev_addr
#if defined(_WIN32) || defined(_WIN64)
|| prev_size > size_bytes
#endif /* Windows */
)
? 0 /* reassign readahead to the entire map
because it was remapped */
: prev_size;
if (size_bytes > readahead_pivot) {
env->me_discarded_tail->weak = size_pgno;
rc = mdbx_set_readahead(env, readahead_pivot,
size_bytes - readahead_pivot, true);
}
}
;
rc = mdbx_set_readahead(env, size_pgno, readahead, force);
}
#endif /* MDBX_ENABLE_MADVISE */
@ -11141,7 +11192,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
#if MDBX_ENABLE_MADVISE
/* calculate readahead hint before mmap with zero redundant pages */
const bool readahead =
(env->me_flags & MDBX_NORDAHEAD) == 0 &&
!(env->me_flags & MDBX_NORDAHEAD) &&
mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE;
#endif /* MDBX_ENABLE_MADVISE */
@ -11375,9 +11426,9 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
atomic_store32(env->me_discarded_tail, bytes2pgno(env, used_aligned2os_bytes),
mo_Relaxed);
#if MDBX_ENABLE_MADVISE
if (used_aligned2os_bytes < env->me_dxb_mmap.current) {
if (lck_rc && used_aligned2os_bytes < env->me_dxb_mmap.current) {
#if defined(MADV_REMOVE)
if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 &&
if ((env->me_flags & MDBX_WRITEMAP) != 0 &&
/* not recovery mode */ env->me_stuck_meta < 0) {
mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)",
env->me_discarded_tail->weak,
@ -11416,8 +11467,8 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
#endif /* MADV_DONTNEED */
}
err = mdbx_set_readahead(env, 0, used_bytes, readahead);
if (err != MDBX_SUCCESS && lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE)
err = mdbx_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true);
if (unlikely(err != MDBX_SUCCESS))
return err;
#endif /* MDBX_ENABLE_MADVISE */
@ -11470,6 +11521,7 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending;
env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold;
env->me_discarded_tail = &env->me_lckless_stub.discarded_tail;
env->me_readahead_anchor = &env->me_lckless_stub.readahead_anchor;
env->me_meta_sync_txnid = &env->me_lckless_stub.meta_sync_txnid;
env->me_maxreaders = UINT_MAX;
#if MDBX_LOCKING > 0
@ -11569,6 +11621,10 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
: MDBX_SUCCESS;
if (unlikely(MDBX_IS_ERROR(err)))
goto bailout;
#elif defined(POSIX_MADV_WILLNEED)
err = ignore_enosys(posix_madvise(env->me_lck, size, POSIX_MADV_WILLNEED));
if (unlikely(MDBX_IS_ERROR(err)))
goto bailout;
#endif /* MADV_WILLNEED */
#endif /* MDBX_ENABLE_MADVISE */
@ -11623,6 +11679,7 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
env->me_unsynced_pages = &lck->mti_unsynced_pages;
env->me_autosync_threshold = &lck->mti_autosync_threshold;
env->me_discarded_tail = &lck->mti_discarded_tail;
env->me_readahead_anchor = &lck->mti_readahead_anchor;
env->me_meta_sync_txnid = &lck->mti_meta_sync_txnid;
#if MDBX_LOCKING > 0
env->me_wlock = &lck->mti_wlock;
@ -12216,6 +12273,7 @@ static __cold int mdbx_env_close0(MDBX_env *env) {
env->me_unsynced_pages = nullptr;
env->me_autosync_threshold = nullptr;
env->me_discarded_tail = nullptr;
env->me_readahead_anchor = nullptr;
env->me_meta_sync_txnid = nullptr;
if (env->me_flags & MDBX_ENV_TXKEY)
mdbx_rthc_remove(env->me_txkey);

View File

@ -536,7 +536,7 @@ typedef struct MDBX_lockinfo {
* Zero means timed auto-sync is disabled. */
MDBX_atomic_uint64_t mti_autosync_period;
/* Marker to distinguish uniqueness of DB/CLK.*/
/* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
@ -562,6 +562,9 @@ typedef struct MDBX_lockinfo {
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
/* Readeaders registration lock. */
@ -975,6 +978,7 @@ struct MDBX_env {
atomic_pgno_t *me_unsynced_pages;
atomic_pgno_t *me_autosync_threshold;
atomic_pgno_t *me_discarded_tail;
pgno_t *me_readahead_anchor;
MDBX_atomic_uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
unsigned me_dp_reserve_len;
@ -998,6 +1002,7 @@ struct MDBX_env {
atomic_pgno_t autosync_pending;
atomic_pgno_t autosync_threshold;
atomic_pgno_t discarded_tail;
pgno_t readahead_anchor;
MDBX_atomic_uint32_t meta_sync_txnid;
} me_lckless_stub;
#if MDBX_DEBUG