mdbx: добавление mdbx_env_warmup()

This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2022-10-24 01:02:38 +03:00
parent b04f7814ef
commit d661d4bac7
4 changed files with 465 additions and 13 deletions

88
mdbx.h
View File

@ -2808,6 +2808,94 @@ LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) {
return mdbx_env_close_ex(env, false); return mdbx_env_close_ex(env, false);
} }
/** \brief Warming up options
* \ingroup c_settings
* \anchor warmup_flags
* \see mdbx_env_warmup() */
enum MDBX_warmup_flags_t {
/** By default \ref mdbx_env_warmup() just asks the OS kernel to
* asynchronously prefetch database pages. */
MDBX_warmup_default = 0,
/** Peek all pages of the allocated portion of the database
* to force them to be loaded into memory. However, the pages are just peeked
* sequentially, so unused pages that are in GC will be loaded in the same
* way as those that contain payload. */
MDBX_warmup_force = 1,
/** Use system calls to peek pages instead of accessing them directly,
* which at the cost of additional overhead avoids killing the current
* process by the OOM-killer in a lack-of-memory condition.
* \note Has effect only on POSIX (non-Windows) systems in conjunction
* with the \ref MDBX_warmup_force option. */
MDBX_warmup_oomsafe = 2,
/** Try to lock database pages in memory by `mlock()` on POSIX-systems
* or `VirtualLock()` on Windows. Please refer to the description of these
* functions for the reasonability of such locking and the information about
* its effects, including on the system as a whole.
*
* Such locking in memory requires that the corresponding resource limits
* (e.g. `RLIMIT_RSS`, `RLIMIT_MEMLOCK` or process working set size)
* and the availability of system RAM are sufficiently high.
*
* On success, all currently allocated pages, both unused in GC and
* containing payload, will be locked in memory until the environment closes,
* or the lock is explicitly released by using \ref MDBX_warmup_release, or
* the database geometry is changed, including its auto-shrinking. */
MDBX_warmup_lock = 4,
/** Alters the corresponding current resource limits to be enough for locking
* pages by \ref MDBX_warmup_lock. However, this option should be used only in
* the simplest applications since it takes into account only the current size
* of this environment, disregarding all other factors. For a real-world
* database application you will need full-fledged management of resources and
* their limits with respective engineering. */
MDBX_warmup_touchlimit = 8,
/** Release the lock that was performed before by \ref MDBX_warmup_lock. */
MDBX_warmup_release = 16,
};
#ifndef __cplusplus
typedef enum MDBX_warmup_flags_t MDBX_warmup_flags_t;
#else
DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t)
#endif
/** \brief Warms up the database by loading pages into memory, optionally
* locking them. \ingroup c_settings
*
* Depending on the specified flags, notifies the OS kernel about upcoming
* access, force-loads the database pages, locks them in memory or releases
* such a lock. However, the function does not analyze the b-tree nor the GC.
* Therefore unused pages that are in GC are handled (i.e. will be loaded) in
* the same way as those that contain payload.
*
* At least one of env or txn argument must be non-null.
*
* \param [in] env An environment handle returned
* by \ref mdbx_env_create().
* \param [in] txn A transaction handle returned
* by \ref mdbx_txn_begin().
* \param [in] flags The \ref warmup_flags, bitwise OR'ed together.
*
* \param [in] timeout_seconds_16dot16 Optional timeout which is checked only
* while explicitly peeking database pages
* to load them, i.e. if the
* \ref MDBX_warmup_force option was specified.
*
* \returns A non-zero error value on failure and 0 on success.
* Some possible errors are:
*
* \retval MDBX_EINVAL Both env and txn are null, txn does not
* belong to env, or flags has unknown bits.
*
* \retval MDBX_ENOSYS The system does not support requested
* operation(s).
*
* \retval MDBX_RESULT_TRUE The specified timeout was reached while
* loading data into memory. */
LIBMDBX_API int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
MDBX_warmup_flags_t flags,
unsigned timeout_seconds_16dot16);
/** \brief Set environment flags. /** \brief Set environment flags.
* \ingroup c_settings * \ingroup c_settings
* *

View File

@ -263,8 +263,10 @@ __extern_C key_t ftok(const char *, int);
#include <sys/ipc.h> #include <sys/ipc.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/param.h> #include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/statvfs.h> #include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/uio.h> #include <sys/uio.h>
#endif /*---------------------------------------------------------------------*/ #endif /*---------------------------------------------------------------------*/

View File

@ -5561,7 +5561,7 @@ MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) {
#if MDBX_ENABLE_MADVISE #if MDBX_ENABLE_MADVISE
/* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */
__cold static int set_readahead(MDBX_env *env, const pgno_t edge, __cold static int set_readahead(const MDBX_env *env, const pgno_t edge,
const bool enable, const bool force_whole) { const bool enable, const bool force_whole) {
eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1);
eASSERT(env, (enable & 1) == (enable != 0)); eASSERT(env, (enable & 1) == (enable != 0));
@ -5687,6 +5687,82 @@ __cold static int set_readahead(MDBX_env *env, const pgno_t edge,
} }
#endif /* MDBX_ENABLE_MADVISE */ #endif /* MDBX_ENABLE_MADVISE */
/* Records a new boundary of the memory-locked page range in
 * env->me_mlocked_pgno and adjusts the shared counter of processes that hold
 * locked pages (mti_mlock_counter in the lock-file region).
 *
 * With lock_not_release == true the boundary may only move up, otherwise it
 * may only move down; a request that would not move it in that direction
 * does nothing. The counter is incremented when the boundary leaves zero and
 * decremented when it returns to zero. Both updates are done with
 * compare-and-swap loops that retry on contention. */
__cold static void update_mlocked(const MDBX_env *env,
                                  const pgno_t new_aligned_mlocked_pgno,
                                  const bool lock_not_release) {
  for (;;) {
    const pgno_t mlock_pgno_snap =
        atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease);
    eASSERT(env, pgno_align2os_pgno(env, mlock_pgno_snap) == mlock_pgno_snap);
    eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) ==
                     new_aligned_mlocked_pgno);

    /* Nothing to do if the boundary is already at or beyond the target. */
    if (lock_not_release) {
      if (mlock_pgno_snap >= new_aligned_mlocked_pgno)
        break;
    } else {
      if (mlock_pgno_snap <= new_aligned_mlocked_pgno)
        break;
    }

    /* Lost the race for the boundary: re-read it and try again. */
    if (unlikely(!atomic_cas32(&((MDBX_env *)env)->me_mlocked_pgno,
                               mlock_pgno_snap, new_aligned_mlocked_pgno)))
      continue;

    /* The boundary is ours now, bring the process counter in step with it. */
    MDBX_atomic_uint32_t *const counter = &env->me_lck->mti_mlock_counter;
    const pgno_t range_from =
        lock_not_release ? mlock_pgno_snap : new_aligned_mlocked_pgno;
    const pgno_t range_to =
        lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_snap;
    for (;;) {
      const uint32_t seen = atomic_load32(counter, mo_Relaxed);
      if (mlock_pgno_snap == 0 && seen < INT_MAX) {
        /* first locked page(s) of this process */
        eASSERT(env, lock_not_release);
        if (unlikely(!atomic_cas32(counter, seen, seen + 1)))
          continue;
      }
      if (new_aligned_mlocked_pgno == 0 && seen > 0) {
        /* last locked page(s) of this process were released */
        eASSERT(env, !lock_not_release);
        if (unlikely(!atomic_cas32(counter, seen, seen - 1)))
          continue;
      }
      NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u",
             lock_not_release ? "lock" : "unlock", range_from, range_to, seen,
             atomic_load32(counter, mo_Relaxed));
      return;
    }
  }
}
/* Unlocks the tail of the mapping, i.e. the bytes from page `aligned_pgno`
 * up to `end_bytes`, if the recorded locked boundary lies above
 * `aligned_pgno`.
 *
 * Uses VirtualUnlock() on Windows (ERROR_NOT_LOCKED is treated as success)
 * and munlock() where _POSIX_MEMLOCK_RANGE is defined; elsewhere the result
 * is MDBX_ENOSYS. On success the bookkeeping is updated via
 * update_mlocked(), on failure only a warning is logged. */
__cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno,
                                 const size_t end_bytes) {
  if (atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) <= aligned_pgno)
    return; /* nothing is locked beyond the requested boundary */

  int err = MDBX_ENOSYS;
  const size_t munlock_begin = pgno2bytes(env, aligned_pgno);
  const size_t munlock_size = end_bytes - munlock_begin;
  eASSERT(env, end_bytes % env->me_os_psize == 0 &&
                   munlock_begin % env->me_os_psize == 0 &&
                   munlock_size % env->me_os_psize == 0);

#if defined(_WIN32) || defined(_WIN64)
  if (VirtualUnlock(env->me_map + munlock_begin, munlock_size))
    err = MDBX_SUCCESS;
  else
    err = (int)GetLastError();
  if (err == ERROR_NOT_LOCKED)
    err = MDBX_SUCCESS;
#elif defined(_POSIX_MEMLOCK_RANGE)
  if (munlock(env->me_map + munlock_begin, munlock_size))
    err = errno;
  else
    err = MDBX_SUCCESS;
#endif

  if (unlikely(err != MDBX_SUCCESS)) {
#if defined(_WIN32) || defined(_WIN64)
    WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size,
            err);
#else
    WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err);
#endif
    return;
  }

  update_mlocked(env, aligned_pgno, false);
}
/* Releases the memory lock from the whole data mapping: everything from
 * page 0 up to the current mapping size rounded by bytes_align2os_bytes(). */
__cold static void munlock_all(const MDBX_env *env) {
  const size_t whole_map_bytes =
      bytes_align2os_bytes(env, env->me_dxb_mmap.current);
  munlock_after(env, 0, whole_map_bytes);
}
__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
const pgno_t size_pgno, const pgno_t limit_pgno, const pgno_t size_pgno, const pgno_t limit_pgno,
const bool implicit) { const bool implicit) {
@ -5790,6 +5866,12 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
goto bailout; goto bailout;
} }
const pgno_t aligned_munlock_pgno =
(mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE))
? 0
: bytes2pgno(env, size_bytes);
munlock_after(env, aligned_munlock_pgno, size_bytes);
#if MDBX_ENABLE_MADVISE #if MDBX_ENABLE_MADVISE
if (size_bytes < prev_size) { if (size_bytes < prev_size) {
NOTICE("resize-MADV_%s %u..%u", NOTICE("resize-MADV_%s %u..%u",
@ -5820,10 +5902,23 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
prev_size - size_bytes, prev_size - size_bytes,
POSIX_FADV_DONTNEED)); POSIX_FADV_DONTNEED));
#endif /* MADV_DONTNEED */ #endif /* MADV_DONTNEED */
if (unlikely(MDBX_IS_ERROR(rc))) uint32_t snap_mlock_counter;
goto bailout; if (unlikely(rc == MDBX_EINVAL) &&
if (env->me_lck->mti_discarded_tail.weak > size_pgno) (snap_mlock_counter =
env->me_lck->mti_discarded_tail.weak = size_pgno; atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed)) > 0) {
NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have %u "
"mlocked-process(es))",
"resize", rc, snap_mlock_counter);
} else {
if (unlikely(MDBX_IS_ERROR(rc))) {
ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d",
"mresize", "DONTNEED", size_bytes, prev_size - size_bytes,
atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), rc);
goto bailout;
}
if (env->me_lck->mti_discarded_tail.weak > size_pgno)
env->me_lck->mti_discarded_tail.weak = size_pgno;
}
} }
#endif /* MDBX_ENABLE_MADVISE */ #endif /* MDBX_ENABLE_MADVISE */
@ -11368,13 +11463,15 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
const pgno_t prev_discarded_pgno = const pgno_t prev_discarded_pgno =
atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) { if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) {
NOTICE("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, NOTICE("shrink-MADV_%s %u..%u", "DONTNEED", largest_pgno,
prev_discarded_pgno); prev_discarded_pgno);
atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno,
mo_Relaxed); mo_Relaxed);
const size_t prev_discarded_bytes = const size_t prev_discarded_bytes =
ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize);
ENSURE(env, prev_discarded_bytes > discard_edge_bytes); ENSURE(env, prev_discarded_bytes > discard_edge_bytes);
munlock_after(env, discard_edge_pgno,
bytes_align2os_bytes(env, env->me_dxb_mmap.current));
#if defined(MADV_DONTNEED) #if defined(MADV_DONTNEED)
int advise = MADV_DONTNEED; int advise = MADV_DONTNEED;
#if defined(MADV_FREE) && \ #if defined(MADV_FREE) && \
@ -11391,8 +11488,23 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
env->me_map + discard_edge_bytes, env->me_map + discard_edge_bytes,
prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
#endif #endif
if (unlikely(MDBX_IS_ERROR(err))) uint32_t snap_mlock_counter;
if (unlikely(err == MDBX_EINVAL) &&
(snap_mlock_counter = atomic_load32(&env->me_lck->mti_mlock_counter,
mo_Relaxed)) > 0) {
NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have "
"%u mlocked-process(es))",
"shrink", err, snap_mlock_counter);
} else if (unlikely(MDBX_IS_ERROR(err))) {
ERROR("%s-madvise(%s, %zu..%zu), err %d", "shrink", "DONTNEED",
discard_edge_bytes, prev_discarded_bytes - discard_edge_bytes,
err);
ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d",
"shrink", "DONTNEED", discard_edge_bytes,
prev_discarded_bytes - discard_edge_bytes,
atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), err);
return err; return err;
}
} }
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */
@ -13896,17 +14008,21 @@ __cold static int env_close(MDBX_env *env) {
} }
env->me_flags &= ~ENV_INTERNAL_FLAGS; env->me_flags &= ~ENV_INTERNAL_FLAGS;
env->me_lck = nullptr;
if (flags & MDBX_ENV_TXKEY) { if (flags & MDBX_ENV_TXKEY) {
rthc_remove(env->me_txkey); rthc_remove(env->me_txkey);
env->me_txkey = (osal_thread_key_t)0; env->me_txkey = (osal_thread_key_t)0;
} }
munlock_all(env);
osal_ioring_destroy(&env->me_ioring);
lcklist_lock(); lcklist_lock();
const int rc = lcklist_detach_locked(env); const int rc = lcklist_detach_locked(env);
lcklist_unlock(); lcklist_unlock();
osal_ioring_destroy(&env->me_ioring); env->me_lck = nullptr;
if (env->me_lck_mmap.lck)
osal_munmap(&env->me_lck_mmap);
if (env->me_map) { if (env->me_map) {
osal_munmap(&env->me_dxb_mmap); osal_munmap(&env->me_dxb_mmap);
@ -13934,9 +14050,6 @@ __cold static int env_close(MDBX_env *env) {
env->me_lazy_fd = INVALID_HANDLE_VALUE; env->me_lazy_fd = INVALID_HANDLE_VALUE;
} }
if (env->me_lck_mmap.lck)
osal_munmap(&env->me_lck_mmap);
if (env->me_lfd != INVALID_HANDLE_VALUE) { if (env->me_lfd != INVALID_HANDLE_VALUE) {
(void)osal_closefile(env->me_lfd); (void)osal_closefile(env->me_lfd);
env->me_lfd = INVALID_HANDLE_VALUE; env->me_lfd = INVALID_HANDLE_VALUE;
@ -23589,6 +23702,249 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
/* Estimates the resident-set size needed to keep `database_bytes` of the DB
 * in memory: the size itself, plus 1/64 of it, plus a fixed reserve of
 * (512 + MDBX_WORDBITS * 16) megabytes. */
static size_t estimate_rss(size_t database_bytes) {
  const size_t proportional_part = database_bytes / 64;
  const size_t fixed_reserve = (512 + MDBX_WORDBITS * 16) * MEGABYTE;
  return database_bytes + proportional_part + fixed_reserve;
}
/* Warms up the database by asking the OS to prefetch the used part of the
 * data mapping and, depending on `flags`, by peeking every OS page, adjusting
 * resource limits, and locking or unlocking the pages in memory.
 *
 * Stages, in order:
 *  1. Argument validation; `env` is taken from `txn` when only `txn` is given.
 *  2. MDBX_warmup_release: unlock everything locked before.
 *  3. The used range is computed from `txn->mt_geo.next` or, without a
 *     transaction, from the most recent meta page.
 *  4. MDBX_warmup_touchlimit: raise the working-set size (Windows) or
 *     RLIMIT_RSS / RLIMIT_MEMLOCK up to estimate_rss(used range).
 *  5. MDBX_warmup_lock on Linux with MLOCK_ONFAULT: try mlock2().
 *  6. Prefetch hint (set_readahead(), or madvise/fcntl/PrefetchVirtualMemory
 *     when MDBX_ENABLE_MADVISE is off).
 *  7. MDBX_warmup_force: read one byte from each OS page, either directly or
 *     (MDBX_warmup_oomsafe, non-Windows) via writev() to /dev/null.
 *  8. MDBX_warmup_lock: VirtualLock()/mlock() unless already locked.
 *
 * Returns MDBX_SUCCESS, MDBX_RESULT_TRUE if the timeout expired while
 * peeking pages, MDBX_EINVAL for bad arguments, MDBX_ENOSYS when nothing
 * applicable is available, or a system error code. */
__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
                           MDBX_warmup_flags_t flags,
                           unsigned timeout_seconds_16dot16) {
  if (unlikely(env == NULL && txn == NULL))
    return MDBX_EINVAL;
  /* All defined flags occupy the contiguous low bits, so a simple magnitude
   * comparison rejects any unknown bit. */
  if (unlikely(flags >
               (MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock |
                MDBX_warmup_touchlimit | MDBX_warmup_release)))
    return MDBX_EINVAL;

  if (txn) {
    int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }
  if (env) {
    int err = check_env(env, false);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    if (txn && unlikely(txn->mt_env != env))
      return MDBX_EINVAL;
  } else {
    env = txn->mt_env;
  }

  /* The deadline matters only for the explicit page-peeking stage. */
  const uint64_t timeout_monotime =
      (timeout_seconds_16dot16 && (flags & MDBX_warmup_force))
          ? osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16)
          : 0;

  if (flags & MDBX_warmup_release)
    munlock_all(env);

  pgno_t used_pgno;
  if (txn) {
    used_pgno = txn->mt_geo.next;
  } else {
    const meta_troika_t troika = meta_tap(env);
    used_pgno = meta_recent(env, &troika).ptr_v->mm_geo.next;
  }
  const size_t used_range = pgno_align2os_bytes(env, used_pgno);
  const pgno_t mlock_pgno = bytes2pgno(env, used_range);

  int rc = MDBX_SUCCESS;
  if (flags & MDBX_warmup_touchlimit) {
    const size_t estimated_rss = estimate_rss(used_range);
#if defined(_WIN32) || defined(_WIN64)
    SIZE_T current_ws_lower, current_ws_upper;
    if (GetProcessWorkingSetSize(GetCurrentProcess(), &current_ws_lower,
                                 &current_ws_upper) &&
        current_ws_lower < estimated_rss) {
      const SIZE_T ws_lower = estimated_rss;
      const SIZE_T ws_upper =
          (MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048)
              ? ws_lower
              : ws_lower + MDBX_WORDBITS * MEGABYTE * 32;
      if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) {
        rc = (int)GetLastError();
        WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower,
                ws_upper, rc);
      }
    }
#endif /* Windows */
#ifdef RLIMIT_RSS
    struct rlimit rss;
    if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) {
      rss.rlim_cur = estimated_rss;
      /* The hard limit must not be lower than the soft one, otherwise
       * setrlimit() fails with EINVAL; so raise it to the same estimate
       * (not to the smaller used_range). */
      if (rss.rlim_max < estimated_rss)
        rss.rlim_max = estimated_rss;
      if (setrlimit(RLIMIT_RSS, &rss)) {
        rc = errno;
        WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS",
                (size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc);
      }
    }
#endif /* RLIMIT_RSS */
#ifdef RLIMIT_MEMLOCK
    if (flags & MDBX_warmup_lock) {
      struct rlimit memlock;
      if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 &&
          memlock.rlim_cur < estimated_rss) {
        memlock.rlim_cur = estimated_rss;
        if (memlock.rlim_max < estimated_rss)
          memlock.rlim_max = estimated_rss;
        if (setrlimit(RLIMIT_MEMLOCK, &memlock)) {
          rc = errno;
          WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK",
                  (size_t)memlock.rlim_cur, (size_t)memlock.rlim_max, rc);
        }
      }
    }
#endif /* RLIMIT_MEMLOCK */
    (void)estimated_rss;
  }

#if defined(MLOCK_ONFAULT) &&                                                  \
    ((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) ||                        \
     (defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) &&                   \
    (defined(__linux__) || defined(__gnu_linux__))
  if ((flags & MDBX_warmup_lock) != 0 && linux_kernel_version >= 0x04040000 &&
      atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
    if (mlock2(env->me_map, used_range, MLOCK_ONFAULT)) {
      rc = errno;
      WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc);
    } else {
      update_mlocked(env, mlock_pgno, true);
      rc = MDBX_SUCCESS;
    }
    /* Keep the lock request pending only when mlock2() reported EINVAL.
     * NOTE(review): rc stays EINVAL in that case, so the later stages that
     * require rc to be MDBX_SUCCESS or MDBX_ENOSYS are skipped — confirm
     * whether a fallback to plain mlock() was intended here. */
    if (rc != EINVAL)
      flags -= MDBX_warmup_lock;
  }
#endif /* MLOCK_ONFAULT */

  int err = MDBX_ENOSYS;
#if MDBX_ENABLE_MADVISE
  err = set_readahead(env, used_pgno, true, true);
#else
#if defined(_WIN32) || defined(_WIN64)
  if (mdbx_PrefetchVirtualMemory) {
    WIN32_MEMORY_RANGE_ENTRY hint;
    hint.VirtualAddress = env->me_map;
    hint.NumberOfBytes = used_range;
    if (mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0))
      err = MDBX_SUCCESS;
    else {
      err = (int)GetLastError();
      ERROR("%s(%zu) error %d", "PrefetchVirtualMemory", used_range, err);
    }
  }
#endif /* Windows */

#if defined(POSIX_MADV_WILLNEED)
  err = posix_madvise(env->me_map, used_range, POSIX_MADV_WILLNEED)
            ? ignore_enosys(errno)
            : MDBX_SUCCESS;
#elif defined(MADV_WILLNEED)
  err = madvise(env->me_map, used_range, MADV_WILLNEED) ? ignore_enosys(errno)
                                                        : MDBX_SUCCESS;
#endif

#if defined(F_RDADVISE)
  if (err) {
    fcntl(env->me_lazy_fd, F_RDAHEAD, true);
    struct radvisory hint;
    hint.ra_offset = 0;
    /* ra_count may be narrower than size_t, clamp instead of truncating. */
    hint.ra_count = unlikely(used_range > INT_MAX &&
                             sizeof(used_range) > sizeof(hint.ra_count))
                        ? INT_MAX
                        : (int)used_range;
    err = fcntl(env->me_lazy_fd, F_RDADVISE, &hint) ? ignore_enosys(errno)
                                                    : MDBX_SUCCESS;
    if (err == ENOTTY)
      err = MDBX_SUCCESS /* Ignore ENOTTY for DB on the ram-disk */;
  }
#endif /* F_RDADVISE */
#endif /* MDBX_ENABLE_MADVISE */
  /* A prefetch failure is reported only if nothing failed earlier. */
  if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS)
    rc = err;

  if ((flags & MDBX_warmup_force) != 0 &&
      (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) {
    const volatile uint8_t *ptr = env->me_map;
    size_t offset = 0, unused = 42;
#if !(defined(_WIN32) || defined(_WIN64))
    if (flags & MDBX_warmup_oomsafe) {
      /* Let the kernel read one byte of each OS page on our behalf, so a
       * failure to load a page comes back as an error code from writev(). */
      const int null_fd = open("/dev/null", O_WRONLY);
      if (unlikely(null_fd < 0))
        rc = errno;
      else {
        struct iovec iov[MDBX_AUXILARY_IOV_MAX];
        for (;;) {
          unsigned i;
          for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) {
            iov[i].iov_base = (void *)(ptr + offset);
            iov[i].iov_len = 1;
            offset += env->me_os_psize;
          }
          if (unlikely(writev(null_fd, iov, i) < 0)) {
            rc = errno;
            if (rc == EFAULT)
              rc = ENOMEM;
            break;
          }
          if (offset >= used_range) {
            rc = MDBX_SUCCESS;
            break;
          }
          if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
            rc = MDBX_RESULT_TRUE;
            break;
          }
        }
        close(null_fd);
      }
    } else
#endif /* Windows */
      for (;;) {
        /* Direct peek: reading through a volatile pointer touches the page. */
        unused += ptr[offset];
        offset += env->me_os_psize;
        if (offset >= used_range) {
          rc = MDBX_SUCCESS;
          break;
        }
        if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) {
          rc = MDBX_RESULT_TRUE;
          break;
        }
      }
    (void)unused;
  }

  if ((flags & MDBX_warmup_lock) != 0 &&
      (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) &&
      atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
#if defined(_WIN32) || defined(_WIN64)
    if (VirtualLock(env->me_map, used_range)) {
      update_mlocked(env, mlock_pgno, true);
      rc = MDBX_SUCCESS;
    } else {
      rc = (int)GetLastError();
      WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc);
    }
#elif defined(_POSIX_MEMLOCK_RANGE)
    if (mlock(env->me_map, used_range) == 0) {
      update_mlocked(env, mlock_pgno, true);
      rc = MDBX_SUCCESS;
    } else {
      rc = errno;
      WARNING("%s(%zu) error %d", "mlock", used_range, rc);
    }
#else
    rc = MDBX_ENOSYS;
#endif
  }

  return rc;
}
__cold void global_ctor(void) { __cold void global_ctor(void) {
rthc_limit = RTHC_INITIAL_LIMIT; rthc_limit = RTHC_INITIAL_LIMIT;
rthc_table = rthc_table_static; rthc_table = rthc_table_static;

View File

@ -731,6 +731,11 @@ typedef struct MDBX_lockinfo {
/* Marker to distinguish uniqueness of DB/CLK. */ /* Marker to distinguish uniqueness of DB/CLK. */
MDBX_atomic_uint64_t mti_bait_uniqueness; MDBX_atomic_uint64_t mti_bait_uniqueness;
/* Counter of processes which had mlock()'ed some of mmapped DB pages.
* Non-zero means at least one process has locked at least one page,
* and therefore madvise() could return EINVAL. */
MDBX_atomic_uint32_t mti_mlock_counter;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
#if MDBX_ENABLE_PGOP_STAT #if MDBX_ENABLE_PGOP_STAT
@ -1169,7 +1174,8 @@ struct MDBX_env {
unsigned me_psize; /* DB page size, initialized from me_os_psize */ unsigned me_psize; /* DB page size, initialized from me_os_psize */
unsigned me_leaf_nodemax; /* max size of a leaf-node */ unsigned me_leaf_nodemax; /* max size of a leaf-node */
unsigned me_branch_nodemax; /* max size of a branch-node */ unsigned me_branch_nodemax; /* max size of a branch-node */
uint8_t me_psize2log; /* log2 of DB page size */ atomic_pgno_t me_mlocked_pgno;
uint8_t me_psize2log; /* log2 of DB page size */
int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
uint16_t me_merge_threshold, uint16_t me_merge_threshold,
me_merge_threshold_gc; /* pages emptier than this are candidates for me_merge_threshold_gc; /* pages emptier than this are candidates for