From 9c89e7c7391cba911c67f8dcaf975c833411d0dc Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 26 Oct 2019 23:57:49 +0300 Subject: [PATCH] mdbx: refine discarding of unused pages (MADV_FREE/MADV_REMOVE/MADV_DONTNEED). Change-Id: I657eb7ef9060214d6ed3d75a2deeebc9ff3df5f5 --- src/elements/core.c | 164 +++++++++++++++++-------------------- src/elements/lck-windows.c | 14 +++- src/elements/osal.h | 32 +++++--- 3 files changed, 109 insertions(+), 101 deletions(-) diff --git a/src/elements/core.c b/src/elements/core.c index 97721876..1210fc63 100644 --- a/src/elements/core.c +++ b/src/elements/core.c @@ -3422,8 +3422,8 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, limit_bytes); mdbx_assert(env, limit_bytes >= size_bytes); - mdbx_assert(env, bytes2pgno(env, size_bytes) == size_pgno); - mdbx_assert(env, bytes2pgno(env, limit_bytes) == limit_pgno); + mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); + mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); #if defined(_WIN32) || defined(_WIN64) /* Acquire guard in exclusive mode for: @@ -3459,11 +3459,35 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); if (rc != MDBX_SUCCESS) return rc; - if (limit_bytes == env->me_dxb_mmap.length && - bytes2pgno(env, size_bytes) == env->me_dbgeo.now) + if (limit_bytes == env->me_dxb_mmap.length && size_bytes == env->me_dbgeo.now) goto bailout; #endif /* Windows */ + if (size_bytes < env->me_dbgeo.now) { + mdbx_notice("resize-MADV_%s %u..%u", + (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", + size_pgno, bytes2pgno(env, env->me_dbgeo.now)); +#if defined(MADV_REMOVE) + if ((env->me_flags & MDBX_WRITEMAP) == 0 || + madvise(env->me_map + size_bytes, env->me_dbgeo.now - size_bytes, + MADV_REMOVE) != 0) +#endif +#if defined(MADV_DONTNEED) + (void)madvise(env->me_map + size_bytes, env->me_dbgeo.now - size_bytes, + MADV_DONTNEED); +#elif defined(POSIX_MADV_DONTNEED) + (void)posix_madvise(env->me_map + size_bytes, + env->me_dbgeo.now - size_bytes, POSIX_MADV_DONTNEED); +#elif defined(POSIX_FADV_DONTNEED) + (void)posix_fadvise(env->me_fd, size_bytes, env->me_dbgeo.now - size_bytes, + POSIX_FADV_DONTNEED); +#else + __noop(); +#endif /* MADV_DONTNEED */ + if (*env->me_discarded_tail > size_pgno) + *env->me_discarded_tail = size_pgno; + } + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); bailout: @@ -4685,17 +4709,20 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { goto bailout; } } - txn->mt_owner = mdbx_thread_self(); + if (txn->mt_flags & MDBX_RDONLY) { #if defined(_WIN32) || defined(_WIN64) - if ((txn->mt_flags & MDBX_RDONLY) != 0 && size > env->me_dbgeo.lower && - env->me_dbgeo.shrink) { - txn->mt_flags |= MDBX_SHRINK_ALLOWED; - mdbx_srwlock_AcquireShared(&env->me_remap_guard); - } + if (size > env->me_dbgeo.lower && env->me_dbgeo.shrink) { + txn->mt_flags |= MDBX_SHRINK_ALLOWED; + mdbx_srwlock_AcquireShared(&env->me_remap_guard); + } #endif + } else { + env->me_dbgeo.now = size; + } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) mdbx_txn_valgrind(env, txn); #endif + txn->mt_owner = mdbx_thread_self(); return MDBX_SUCCESS; } bailout: @@ -6963,7 +6990,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, pgno2bytes(env, edge - largest_pgno)); } #endif /* MDBX_USE_VALGRIND */ -#if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED) +#if defined(MADV_DONTNEED) const size_t largest_aligned2os_bytes = pgno_align2os_bytes(env, largest_pgno); const pgno_t largest_aligned2os_pgno = @@ -6971,17 +6998,29 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, const pgno_t prev_discarded_pgno = *env->me_discarded_tail; if (prev_discarded_pgno > largest_aligned2os_pgno + - /* 256Kb threshold to avoid unreasonable madvise() call */ - bytes2pgno(env, 256 * 1024)) { + /* 1M threshold to avoid unreasonable madvise() call */ + bytes2pgno(env, MEGABYTE)) { + mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail, + largest_pgno); *env->me_discarded_tail = largest_aligned2os_pgno; const size_t prev_discarded_bytes = pgno_align2os_bytes(env, prev_discarded_pgno); mdbx_ensure(env, prev_discarded_bytes > largest_aligned2os_bytes); - (void)madvise(env->me_map + largest_aligned2os_bytes, - prev_discarded_bytes - largest_aligned2os_bytes, - MADV_REMOVE_OR_FREE_OR_DONTNEED); + int advise = MADV_DONTNEED; +#if defined(MADV_FREE) && \ + 0 /* MADV_FREE works for only anon vma at the moment */ + if ((env->me_flags & MDBX_WRITEMAP) && + mdbx_linux_kernel_version > 0x04050000) + advise = MADV_FREE; +#endif /* MADV_FREE */ + int err = madvise(env->me_map + largest_aligned2os_bytes, + prev_discarded_bytes - largest_aligned2os_bytes, advise) + ? errno + : MDBX_SUCCESS; + mdbx_assert(env, err == MDBX_SUCCESS); + (void)err; } -#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */ +#endif /* MADV_FREE || MADV_DONTNEED */ /* LY: check conditions to shrink datafile */ const pgno_t backlog_gap = @@ -7332,82 +7371,29 @@ static int __cold mdbx_env_map(MDBX_env *env, const int is_exclusive, : MADV_DONTDUMP); #endif -#if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED) - if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) { - const size_t used_aligned2os_bytes = - roundup_powerof2(usedsize, env->me_os_psize); - *env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes); - if (used_aligned2os_bytes < env->me_mapsize) { + const size_t used_aligned2os_bytes = + roundup_powerof2(usedsize, env->me_os_psize); + *env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes); + if (used_aligned2os_bytes < env->me_dbgeo.now) { +#if defined(MADV_REMOVE) + if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) (void)madvise(env->me_map + used_aligned2os_bytes, - env->me_mapsize - used_aligned2os_bytes, - MADV_REMOVE_OR_FREE_OR_DONTNEED); - } - } + env->me_dbgeo.now - used_aligned2os_bytes, MADV_REMOVE); #else - (void)is_exclusive; -#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */ - -#ifdef POSIX_FADV_RANDOM - /* this also checks that the file size is valid for a particular FS */ - rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM); - if (unlikely(rc != 0)) - return rc; -#elif defined(F_RDAHEAD) - if (unlikely(fcntl(env->me_fd, F_RDAHEAD, 0) == -1)) - return errno; -#endif - - /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ - if (env->me_flags & MDBX_NORDAHEAD) { -#if defined(MADV_RANDOM) - if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_RANDOM) != 0)) - return errno; -#elif defined(POSIX_MADV_RANDOM) - rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); - if (unlikely(rc != 0)) - return errno; -#endif -#ifdef POSIX_FADV_DONTNEED - rc = posix_fadvise(env->me_fd, 0, env->me_mapsize, POSIX_FADV_DONTNEED); - if (unlikely(rc != 0)) - return rc; -#endif + (void)is_exclusive; +#endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) - if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTNEED) != 0)) - return errno; + (void)madvise(env->me_map + used_aligned2os_bytes, + env->me_dbgeo.now - used_aligned2os_bytes, MADV_DONTNEED); #elif defined(POSIX_MADV_DONTNEED) - rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_DONTNEED); - if (unlikely(rc != 0)) - return errno; -#endif - } else { -#ifdef POSIX_FADV_WILLNEED - rc = posix_fadvise(env->me_fd, 0, usedsize, POSIX_FADV_WILLNEED); - if (unlikely(rc != 0)) - return rc; -#elif defined(F_RDADVISE) - struct radvisory hint; - hint.ra_offset = 0; - hint.ra_count = usedsize; - (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( - env->me_fd, F_RDADVISE, &hint); -#endif -#if defined(MADV_WILLNEED) - if (unlikely(madvise(env->me_map, usedsize, MADV_WILLNEED) != 0)) - return errno; -#elif defined(POSIX_MADV_WILLNEED) - rc = posix_madvise(env->me_map, usedsize, POSIX_MADV_WILLNEED); - if (unlikely(rc != 0)) - return errno; -#endif -#if defined(_WIN32) || defined(_WIN64) - if (mdbx_PrefetchVirtualMemory) { - WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = env->me_map; - hint.NumberOfBytes = usedsize; - (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); - } -#endif /* Windows */ + (void)madvise(env->me_map + used_aligned2os_bytes, + env->me_dbgeo.now - used_aligned2os_bytes, + POSIX_MADV_DONTNEED); +#elif defined(POSIX_FADV_DONTNEED) + (void)posix_fadvise(env->me_fd, used_aligned2os_bytes, + env->me_dbgeo.now - used_aligned2os_bytes, + POSIX_FADV_DONTNEED); +#endif /* MADV_DONTNEED */ } #ifdef MDBX_USE_VALGRIND diff --git a/src/elements/lck-windows.c b/src/elements/lck-windows.c index c2fc0c55..500a9fc6 100644 --- a/src/elements/lck-windows.c +++ b/src/elements/lck-windows.c @@ -689,12 +689,14 @@ MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared, /*----------------------------------------------------------------------------*/ +#if 0 /* LY: unused for now */ static DWORD WINAPI stub_DiscardVirtualMemory(PVOID VirtualAddress, SIZE_T Size) { return VirtualAlloc(VirtualAddress, Size, MEM_RESET, PAGE_NOACCESS) ? ERROR_SUCCESS : GetLastError(); } +#endif /* unused for now */ /*----------------------------------------------------------------------------*/ #ifndef MDBX_ALLOY @@ -702,9 +704,13 @@ MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; -MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; -MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory; MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +#if 0 /* LY: unused for now */ +MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory; +MDBX_OfferVirtualMemory mdbx_OfferVirtualMemory; +MDBX_ReclaimVirtualMemory mdbx_ReclaimVirtualMemory; +#endif /* unused for now */ #endif /* MDBX_ALLOY */ static void mdbx_winnt_import(void) { @@ -736,9 +742,13 @@ static void mdbx_winnt_import(void) { GET_KERNEL32_PROC(GetFinalPathNameByHandleW); GET_KERNEL32_PROC(SetFileInformationByHandle); GET_KERNEL32_PROC(PrefetchVirtualMemory); +#if 0 /* LY: unused for now */ GET_KERNEL32_PROC(DiscardVirtualMemory); if (!mdbx_DiscardVirtualMemory) mdbx_DiscardVirtualMemory = stub_DiscardVirtualMemory; + GET_KERNEL32_PROC(OfferVirtualMemory); + GET_KERNEL32_PROC(ReclaimVirtualMemory); +#endif /* unused for now */ #undef GET_KERNEL32_PROC const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); diff --git a/src/elements/osal.h b/src/elements/osal.h index f6b5e8c5..32988b01 100644 --- a/src/elements/osal.h +++ b/src/elements/osal.h @@ -239,16 +239,6 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -#ifndef MADV_REMOVE_OR_FREE_OR_DONTNEED -#ifdef MADV_REMOVE -#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_REMOVE -#elif defined(MADV_FREE) -#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_FREE -#elif defined(MADV_DONTNEED) -#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_DONTNEED -#endif -#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */ - #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ defined(i486) || defined(__i486) || defined(__i486__) || \ defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ @@ -860,10 +850,32 @@ typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +#if 0 /* LY: unused for now */ +#if !defined(_WIN32_WINNT_WIN81) || _WIN32_WINNT < _WIN32_WINNT_WIN81 +typedef enum OFFER_PRIORITY { + VmOfferPriorityVeryLow = 1, + VmOfferPriorityLow, + VmOfferPriorityBelowNormal, + VmOfferPriorityNormal +} OFFER_PRIORITY; +#endif /* Windows 8.1 */ + typedef DWORD(WINAPI *MDBX_DiscardVirtualMemory)(PVOID VirtualAddress, SIZE_T Size); MDBX_INTERNAL_VAR MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory; +typedef DWORD(WINAPI *MDBX_ReclaimVirtualMemory)(PVOID VirtualAddress, + SIZE_T Size); +MDBX_INTERNAL_VAR MDBX_ReclaimVirtualMemory mdbx_ReclaimVirtualMemory; + +typedef DWORD(WINAPI *MDBX_OfferVirtualMemory( + PVOID VirtualAddress, + SIZE_T Size, + OFFER_PRIORITY Priority +); +MDBX_INTERNAL_VAR MDBX_OfferVirtualMemory mdbx_OfferVirtualMemory; +#endif /* unused for now */ + #endif /* Windows */ /*----------------------------------------------------------------------------*/