mdbx: dynamically discarding unused tail pages of DB file.

Change-Id: I1a0eee50cd27de26521e65c9f7ea51a527a0424e
This commit is contained in:
Leonid Yuriev 2019-08-28 04:57:07 +03:00
parent 327e5feb97
commit 51e7159f36
4 changed files with 82 additions and 34 deletions

View File

@ -475,6 +475,9 @@ typedef struct MDBX_lockinfo {
/* Number of un-synced-with-disk pages for auto-sync feature. */
volatile pgno_t mti_unsynced_pages;
/* Number of the page that was discarded last time by madvise(MADV_FREE). */
volatile pgno_t mti_discarded_tail;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
#ifdef MDBX_OSAL_LOCK
@ -820,6 +823,7 @@ struct MDBX_env {
volatile uint64_t *me_autosync_period;
volatile pgno_t *me_unsynced_pages;
volatile pgno_t *me_autosync_threshold;
volatile pgno_t *me_discarded_tail;
MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */
struct {
#ifdef MDBX_OSAL_LOCK
@ -830,6 +834,7 @@ struct MDBX_env {
uint64_t autosync_period;
pgno_t autosync_pending;
pgno_t autosync_threshold;
pgno_t discarded_tail;
} me_lckless_stub;
#if MDBX_DEBUG
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */

View File

@ -663,11 +663,21 @@ MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared,
/*----------------------------------------------------------------------------*/
/* Substitute for DiscardVirtualMemory(): mdbx_winnt_import() installs it
 * when mdbx_DiscardVirtualMemory is still NULL after the kernel32 lookup.
 *
 * Issues VirtualAlloc(MEM_RESET) on the given range and mirrors the
 * DWORD-status convention of the pointer type it stands in for:
 * ERROR_SUCCESS when VirtualAlloc() returns non-NULL, otherwise the
 * calling thread's GetLastError() value. */
static DWORD WINAPI stub_DiscardVirtualMemory(PVOID VirtualAddress,
                                              SIZE_T Size) {
  if (VirtualAlloc(VirtualAddress, Size, MEM_RESET, PAGE_NOACCESS))
    return ERROR_SUCCESS;

  /* Query the error code only on the failure path. */
  return GetLastError();
}
/*----------------------------------------------------------------------------*/
/* Pointers to WinAPI entry points that are looked up at runtime instead of
 * being linked directly. The visible part of mdbx_winnt_import() fills the
 * kernel32 ones through GET_KERNEL32_PROC() and assigns mdbx_NtFsControlFile
 * after obtaining the ntdll.dll module handle.
 * NOTE(review): presumably a pointer stays NULL when the running system does
 * not export the function - confirm against GET_KERNEL32_PROC. The exception
 * visible here is mdbx_DiscardVirtualMemory, which mdbx_winnt_import() points
 * at stub_DiscardVirtualMemory when the lookup yields NULL. */
MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory;
MDBX_NtFsControlFile mdbx_NtFsControlFile;
static void mdbx_winnt_import(void) {
@ -700,6 +710,9 @@ static void mdbx_winnt_import(void) {
GET_KERNEL32_PROC(GetFinalPathNameByHandleW);
GET_KERNEL32_PROC(SetFileInformationByHandle);
GET_KERNEL32_PROC(PrefetchVirtualMemory);
GET_KERNEL32_PROC(DiscardVirtualMemory);
if (!mdbx_DiscardVirtualMemory)
mdbx_DiscardVirtualMemory = stub_DiscardVirtualMemory;
const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");
mdbx_NtFsControlFile =

View File

@ -5400,31 +5400,51 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
}
/* LY: check conditions to shrink datafile */
const pgno_t backlog_gap =
pending->mm_dbs[FREE_DBI].md_depth + mdbx_backlog_extragap(env);
pgno_t shrink = 0;
if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink &&
pending->mm_geo.now - pending->mm_geo.next >
pending->mm_geo.shrink + backlog_gap) {
const pgno_t largest = mdbx_find_largest(
if (flags & MDBX_SHRINK_ALLOWED) {
/* LY: check conditions to discard unused pages */
const pgno_t largest_pgno = mdbx_find_largest(
env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next
: pending->mm_geo.next);
if (pending->mm_geo.now > largest &&
pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) {
const pgno_t aligner =
pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink;
const pgno_t with_backlog_gap = largest + backlog_gap;
const pgno_t aligned = pgno_align2os_pgno(
env, with_backlog_gap + aligner - with_backlog_gap % aligner);
const pgno_t bottom =
(aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
if (pending->mm_geo.now > bottom) {
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
shrink = pending->mm_geo.now - bottom;
pending->mm_geo.now = bottom;
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a)
mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1);
const size_t largest_aligned2os_bytes =
pgno_align2os_bytes(env, largest_pgno);
const pgno_t largest_aligned2os_pgno =
bytes2pgno(env, largest_aligned2os_bytes);
const pgno_t prev_discarded_pgno = *env->me_discarded_tail;
*env->me_discarded_tail = largest_aligned2os_pgno;
if (prev_discarded_pgno > largest_aligned2os_pgno) {
const size_t prev_discarded_bytes =
pgno_align2os_bytes(env, prev_discarded_pgno);
mdbx_ensure(env, prev_discarded_bytes > largest_aligned2os_bytes);
#if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED)
(void)madvise(env->me_map + largest_aligned2os_bytes,
prev_discarded_bytes - largest_aligned2os_bytes,
MADV_REMOVE_OR_FREE_OR_DONTNEED);
#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
}
/* LY: check conditions to shrink datafile */
const pgno_t backlog_gap =
pending->mm_dbs[FREE_DBI].md_depth + mdbx_backlog_extragap(env);
if (pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next >
pending->mm_geo.shrink + backlog_gap) {
if (pending->mm_geo.now > largest_pgno &&
pending->mm_geo.now - largest_pgno >
pending->mm_geo.shrink + backlog_gap) {
const pgno_t aligner = pending->mm_geo.grow ? pending->mm_geo.grow
: pending->mm_geo.shrink;
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
const pgno_t aligned = pgno_align2os_pgno(
env, with_backlog_gap + aligner - with_backlog_gap % aligner);
const pgno_t bottom =
(aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
if (pending->mm_geo.now > bottom) {
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
shrink = pending->mm_geo.now - bottom;
pending->mm_geo.now = bottom;
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a)
mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1);
}
}
}
}
@ -5755,14 +5775,16 @@ static int __cold mdbx_env_map(MDBX_env *env, const int is_exclusive,
#endif
if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) {
#ifdef MADV_REMOVE_OR_FREE
const size_t used_alined2os = mdbx_roundup2(usedsize, env->me_os_psize);
if (used_alined2os < env->me_mapsize)
(void)madvise(env->me_map + used_alined2os,
env->me_mapsize - used_alined2os, MADV_REMOVE_OR_FREE);
#else
(void)usedsize;
#endif
const size_t used_aligned2os_bytes =
mdbx_roundup2(usedsize, env->me_os_psize);
*env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes);
if (used_aligned2os_bytes < env->me_mapsize) {
#if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED)
(void)madvise(env->me_map + used_aligned2os_bytes,
env->me_mapsize - used_aligned2os_bytes,
MADV_REMOVE_OR_FREE_OR_DONTNEED);
#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
}
}
#ifdef POSIX_FADV_RANDOM
@ -6511,6 +6533,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
env->me_autosync_period = &env->me_lckless_stub.autosync_period;
env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending;
env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold;
env->me_discarded_tail = &env->me_lckless_stub.discarded_tail;
env->me_maxreaders = UINT_MAX;
#ifdef MDBX_OSAL_LOCK
env->me_wmutex = &env->me_lckless_stub.wmutex;
@ -6623,6 +6646,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
env->me_autosync_period = &env->me_lck->mti_autosync_period;
env->me_unsynced_pages = &env->me_lck->mti_unsynced_pages;
env->me_autosync_threshold = &env->me_lck->mti_autosync_threshold;
env->me_discarded_tail = &env->me_lck->mti_discarded_tail;
#ifdef MDBX_OSAL_LOCK
env->me_wmutex = &env->me_lck->mti_wmutex;
#endif

View File

@ -200,13 +200,15 @@ typedef pthread_mutex_t mdbx_fastmutex_t;
#define MADV_DONTDUMP MADV_NOCORE
#endif /* MADV_NOCORE -> MADV_DONTDUMP */
#ifndef MADV_REMOVE_OR_FREE
#ifndef MADV_REMOVE_OR_FREE_OR_DONTNEED
#ifdef MADV_REMOVE
#define MADV_REMOVE_OR_FREE MADV_REMOVE
#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_REMOVE
#elif defined(MADV_FREE)
#define MADV_REMOVE_OR_FREE MADV_FREE
#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_FREE
#elif defined(MADV_DONTNEED)
#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_DONTNEED
#endif
#endif /* MADV_REMOVE_OR_FREE */
#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
defined(i486) || defined(__i486) || defined(__i486__) || \
@ -765,6 +767,10 @@ typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
extern MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
/* Function-pointer type for a DiscardVirtualMemory-style routine: takes the
 * start address and byte size of a range and returns a DWORD status code. */
typedef DWORD(WINAPI *MDBX_DiscardVirtualMemory)(PVOID VirtualAddress,
SIZE_T Size);
/* Pointer through which callers invoke the routine; this line is only the
 * declaration, the pointer object itself is defined in another file. */
extern MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory;
#endif /* Windows */
/*----------------------------------------------------------------------------*/