mdbx: refine discarding of unused pages (MADV_FREE/MADV_REMOVE/MADV_DONTNEED).

Change-Id: I657eb7ef9060214d6ed3d75a2deeebc9ff3df5f5
This commit is contained in:
Leonid Yuriev 2019-10-26 23:57:49 +03:00
parent 40f31ea936
commit 9c89e7c739
3 changed files with 109 additions and 101 deletions

View File

@ -3422,8 +3422,8 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno,
env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, limit_bytes);
mdbx_assert(env, limit_bytes >= size_bytes);
mdbx_assert(env, bytes2pgno(env, size_bytes) == size_pgno);
mdbx_assert(env, bytes2pgno(env, limit_bytes) == limit_pgno);
mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno);
mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno);
#if defined(_WIN32) || defined(_WIN64)
/* Acquire guard in exclusive mode for:
@ -3459,11 +3459,35 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno,
int rc = mdbx_fastmutex_acquire(&env->me_remap_guard);
if (rc != MDBX_SUCCESS)
return rc;
if (limit_bytes == env->me_dxb_mmap.length &&
bytes2pgno(env, size_bytes) == env->me_dbgeo.now)
if (limit_bytes == env->me_dxb_mmap.length && size_bytes == env->me_dbgeo.now)
goto bailout;
#endif /* Windows */
if (size_bytes < env->me_dbgeo.now) {
mdbx_notice("resize-MADV_%s %u..%u",
(env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED",
size_pgno, bytes2pgno(env, env->me_dbgeo.now));
#if defined(MADV_REMOVE)
if ((env->me_flags & MDBX_WRITEMAP) == 0 ||
madvise(env->me_map + size_bytes, env->me_dbgeo.now - size_bytes,
MADV_REMOVE) != 0)
#endif
#if defined(MADV_DONTNEED)
(void)madvise(env->me_map + size_bytes, env->me_dbgeo.now - size_bytes,
MADV_DONTNEED);
#elif defined(POSIX_MADV_DONTNEED)
(void)posix_madvise(env->me_map + size_bytes,
env->me_dbgeo.now - size_bytes, POSIX_MADV_DONTNEED);
#elif defined(POSIX_FADV_DONTNEED)
(void)posix_fadvise(env->me_fd, size_bytes, env->me_dbgeo.now - size_bytes,
POSIX_FADV_DONTNEED);
#else
__noop();
#endif /* MADV_DONTNEED */
if (*env->me_discarded_tail > size_pgno)
*env->me_discarded_tail = size_pgno;
}
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes);
bailout:
@ -4685,17 +4709,20 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
goto bailout;
}
}
txn->mt_owner = mdbx_thread_self();
if (txn->mt_flags & MDBX_RDONLY) {
#if defined(_WIN32) || defined(_WIN64)
if ((txn->mt_flags & MDBX_RDONLY) != 0 && size > env->me_dbgeo.lower &&
env->me_dbgeo.shrink) {
txn->mt_flags |= MDBX_SHRINK_ALLOWED;
mdbx_srwlock_AcquireShared(&env->me_remap_guard);
}
if (size > env->me_dbgeo.lower && env->me_dbgeo.shrink) {
txn->mt_flags |= MDBX_SHRINK_ALLOWED;
mdbx_srwlock_AcquireShared(&env->me_remap_guard);
}
#endif
} else {
env->me_dbgeo.now = size;
}
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
mdbx_txn_valgrind(env, txn);
#endif
txn->mt_owner = mdbx_thread_self();
return MDBX_SUCCESS;
}
bailout:
@ -6963,7 +6990,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
pgno2bytes(env, edge - largest_pgno));
}
#endif /* MDBX_USE_VALGRIND */
#if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED)
#if defined(MADV_DONTNEED)
const size_t largest_aligned2os_bytes =
pgno_align2os_bytes(env, largest_pgno);
const pgno_t largest_aligned2os_pgno =
@ -6971,17 +6998,29 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
const pgno_t prev_discarded_pgno = *env->me_discarded_tail;
if (prev_discarded_pgno >
largest_aligned2os_pgno +
/* 256Kb threshold to avoid unreasonable madvise() call */
bytes2pgno(env, 256 * 1024)) {
/* 1M threshold to avoid unreasonable madvise() call */
bytes2pgno(env, MEGABYTE)) {
mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail,
largest_pgno);
*env->me_discarded_tail = largest_aligned2os_pgno;
const size_t prev_discarded_bytes =
pgno_align2os_bytes(env, prev_discarded_pgno);
mdbx_ensure(env, prev_discarded_bytes > largest_aligned2os_bytes);
(void)madvise(env->me_map + largest_aligned2os_bytes,
prev_discarded_bytes - largest_aligned2os_bytes,
MADV_REMOVE_OR_FREE_OR_DONTNEED);
int advise = MADV_DONTNEED;
#if defined(MADV_FREE) && \
0 /* MADV_FREE works for only anon vma at the moment */
if ((env->me_flags & MDBX_WRITEMAP) &&
mdbx_linux_kernel_version > 0x04050000)
advise = MADV_FREE;
#endif /* MADV_FREE */
int err = madvise(env->me_map + largest_aligned2os_bytes,
prev_discarded_bytes - largest_aligned2os_bytes, advise)
? errno
: MDBX_SUCCESS;
mdbx_assert(env, err == MDBX_SUCCESS);
(void)err;
}
#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
#endif /* MADV_FREE || MADV_DONTNEED */
/* LY: check conditions to shrink datafile */
const pgno_t backlog_gap =
@ -7332,82 +7371,29 @@ static int __cold mdbx_env_map(MDBX_env *env, const int is_exclusive,
: MADV_DONTDUMP);
#endif
#if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED)
if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) {
const size_t used_aligned2os_bytes =
roundup_powerof2(usedsize, env->me_os_psize);
*env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes);
if (used_aligned2os_bytes < env->me_mapsize) {
const size_t used_aligned2os_bytes =
roundup_powerof2(usedsize, env->me_os_psize);
*env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes);
if (used_aligned2os_bytes < env->me_dbgeo.now) {
#if defined(MADV_REMOVE)
if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0)
(void)madvise(env->me_map + used_aligned2os_bytes,
env->me_mapsize - used_aligned2os_bytes,
MADV_REMOVE_OR_FREE_OR_DONTNEED);
}
}
env->me_dbgeo.now - used_aligned2os_bytes, MADV_REMOVE);
#else
(void)is_exclusive;
#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
#ifdef POSIX_FADV_RANDOM
/* this also checks that the file size is valid for a particular FS */
rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM);
if (unlikely(rc != 0))
return rc;
#elif defined(F_RDAHEAD)
if (unlikely(fcntl(env->me_fd, F_RDAHEAD, 0) == -1))
return errno;
#endif
/* Turn on/off readahead. It's harmful when the DB is larger than RAM. */
if (env->me_flags & MDBX_NORDAHEAD) {
#if defined(MADV_RANDOM)
if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_RANDOM) != 0))
return errno;
#elif defined(POSIX_MADV_RANDOM)
rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
if (unlikely(rc != 0))
return errno;
#endif
#ifdef POSIX_FADV_DONTNEED
rc = posix_fadvise(env->me_fd, 0, env->me_mapsize, POSIX_FADV_DONTNEED);
if (unlikely(rc != 0))
return rc;
#endif
(void)is_exclusive;
#endif /* MADV_REMOVE */
#if defined(MADV_DONTNEED)
if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTNEED) != 0))
return errno;
(void)madvise(env->me_map + used_aligned2os_bytes,
env->me_dbgeo.now - used_aligned2os_bytes, MADV_DONTNEED);
#elif defined(POSIX_MADV_DONTNEED)
rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_DONTNEED);
if (unlikely(rc != 0))
return errno;
#endif
} else {
#ifdef POSIX_FADV_WILLNEED
rc = posix_fadvise(env->me_fd, 0, usedsize, POSIX_FADV_WILLNEED);
if (unlikely(rc != 0))
return rc;
#elif defined(F_RDADVISE)
struct radvisory hint;
hint.ra_offset = 0;
hint.ra_count = usedsize;
(void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl(
env->me_fd, F_RDADVISE, &hint);
#endif
#if defined(MADV_WILLNEED)
if (unlikely(madvise(env->me_map, usedsize, MADV_WILLNEED) != 0))
return errno;
#elif defined(POSIX_MADV_WILLNEED)
rc = posix_madvise(env->me_map, usedsize, POSIX_MADV_WILLNEED);
if (unlikely(rc != 0))
return errno;
#endif
#if defined(_WIN32) || defined(_WIN64)
if (mdbx_PrefetchVirtualMemory) {
WIN32_MEMORY_RANGE_ENTRY hint;
hint.VirtualAddress = env->me_map;
hint.NumberOfBytes = usedsize;
(void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0);
}
#endif /* Windows */
(void)madvise(env->me_map + used_aligned2os_bytes,
env->me_dbgeo.now - used_aligned2os_bytes,
POSIX_MADV_DONTNEED);
#elif defined(POSIX_FADV_DONTNEED)
(void)posix_fadvise(env->me_fd, used_aligned2os_bytes,
env->me_dbgeo.now - used_aligned2os_bytes,
POSIX_FADV_DONTNEED);
#endif /* MADV_DONTNEED */
}
#ifdef MDBX_USE_VALGRIND

View File

@ -689,12 +689,14 @@ MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared,
/*----------------------------------------------------------------------------*/
#if 0 /* LY: unused for now */
static DWORD WINAPI stub_DiscardVirtualMemory(PVOID VirtualAddress,
SIZE_T Size) {
return VirtualAlloc(VirtualAddress, Size, MEM_RESET, PAGE_NOACCESS)
? ERROR_SUCCESS
: GetLastError();
}
#endif /* unused for now */
/*----------------------------------------------------------------------------*/
#ifndef MDBX_ALLOY
@ -702,9 +704,13 @@ MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory;
MDBX_NtFsControlFile mdbx_NtFsControlFile;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
#if 0 /* LY: unused for now */
MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory;
MDBX_OfferVirtualMemory mdbx_OfferVirtualMemory;
MDBX_ReclaimVirtualMemory mdbx_ReclaimVirtualMemory;
#endif /* unused for now */
#endif /* MDBX_ALLOY */
static void mdbx_winnt_import(void) {
@ -736,9 +742,13 @@ static void mdbx_winnt_import(void) {
GET_KERNEL32_PROC(GetFinalPathNameByHandleW);
GET_KERNEL32_PROC(SetFileInformationByHandle);
GET_KERNEL32_PROC(PrefetchVirtualMemory);
#if 0 /* LY: unused for now */
GET_KERNEL32_PROC(DiscardVirtualMemory);
if (!mdbx_DiscardVirtualMemory)
mdbx_DiscardVirtualMemory = stub_DiscardVirtualMemory;
GET_KERNEL32_PROC(OfferVirtualMemory);
GET_KERNEL32_PROC(ReclaimVirtualMemory);
#endif /* unused for now */
#undef GET_KERNEL32_PROC
const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");

View File

@ -239,16 +239,6 @@ typedef pthread_mutex_t mdbx_fastmutex_t;
#define MADV_DONTDUMP MADV_NOCORE
#endif /* MADV_NOCORE -> MADV_DONTDUMP */
#ifndef MADV_REMOVE_OR_FREE_OR_DONTNEED
#ifdef MADV_REMOVE
#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_REMOVE
#elif defined(MADV_FREE)
#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_FREE
#elif defined(MADV_DONTNEED)
#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_DONTNEED
#endif
#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
defined(i486) || defined(__i486) || defined(__i486__) || \
defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \
@ -860,10 +850,32 @@ typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
#if 0 /* LY: unused for now */
#if !defined(_WIN32_WINNT_WIN81) || _WIN32_WINNT < _WIN32_WINNT_WIN81
typedef enum OFFER_PRIORITY {
VmOfferPriorityVeryLow = 1,
VmOfferPriorityLow,
VmOfferPriorityBelowNormal,
VmOfferPriorityNormal
} OFFER_PRIORITY;
#endif /* Windows 8.1 */
typedef DWORD(WINAPI *MDBX_DiscardVirtualMemory)(PVOID VirtualAddress,
SIZE_T Size);
MDBX_INTERNAL_VAR MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory;
typedef DWORD(WINAPI *MDBX_ReclaimVirtualMemory)(PVOID VirtualAddress,
SIZE_T Size);
MDBX_INTERNAL_VAR MDBX_ReclaimVirtualMemory mdbx_ReclaimVirtualMemory;
typedef DWORD(WINAPI *MDBX_OfferVirtualMemory(
PVOID VirtualAddress,
SIZE_T Size,
OFFER_PRIORITY Priority
);
MDBX_INTERNAL_VAR MDBX_OfferVirtualMemory mdbx_OfferVirtualMemory;
#endif /* unused for now */
#endif /* Windows */
/*----------------------------------------------------------------------------*/