mdbx: dynamically discarding unused tail pages of DB file.

Change-Id: I1a0eee50cd27de26521e65c9f7ea51a527a0424e
This commit is contained in:
Leonid Yuriev 2019-08-28 04:57:07 +03:00
parent 327e5feb97
commit 51e7159f36
4 changed files with 82 additions and 34 deletions

View File

@ -475,6 +475,9 @@ typedef struct MDBX_lockinfo {
/* Number un-synced-with-disk pages for auto-sync feature. */ /* Number un-synced-with-disk pages for auto-sync feature. */
volatile pgno_t mti_unsynced_pages; volatile pgno_t mti_unsynced_pages;
/* Page number from which the unused tail was last discarded by madvise(). */
volatile pgno_t mti_discarded_tail;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
#ifdef MDBX_OSAL_LOCK #ifdef MDBX_OSAL_LOCK
@ -820,6 +823,7 @@ struct MDBX_env {
volatile uint64_t *me_autosync_period; volatile uint64_t *me_autosync_period;
volatile pgno_t *me_unsynced_pages; volatile pgno_t *me_unsynced_pages;
volatile pgno_t *me_autosync_threshold; volatile pgno_t *me_autosync_threshold;
volatile pgno_t *me_discarded_tail;
MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */
struct { struct {
#ifdef MDBX_OSAL_LOCK #ifdef MDBX_OSAL_LOCK
@ -830,6 +834,7 @@ struct MDBX_env {
uint64_t autosync_period; uint64_t autosync_period;
pgno_t autosync_pending; pgno_t autosync_pending;
pgno_t autosync_threshold; pgno_t autosync_threshold;
pgno_t discarded_tail;
} me_lckless_stub; } me_lckless_stub;
#if MDBX_DEBUG #if MDBX_DEBUG
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ MDBX_assert_func *me_assert_func; /* Callback for assertion failures */

View File

@ -663,11 +663,21 @@ MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared,
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
/* Substitute for DiscardVirtualMemory(): mdbx_winnt_import() assigns it to
 * mdbx_DiscardVirtualMemory when that pointer is still null after the
 * kernel32 lookup.
 *
 * Issues VirtualAlloc(MEM_RESET) over [VirtualAddress, VirtualAddress + Size).
 * Returns ERROR_SUCCESS if VirtualAlloc() gives back a non-null pointer,
 * otherwise whatever GetLastError() reports.
 *
 * NOTE(review): the VirtualAlloc documentation says MEM_RESET fails for
 * ranges mapped to a file -- confirm callers tolerate an error result. */
static DWORD WINAPI stub_DiscardVirtualMemory(PVOID VirtualAddress,
                                              SIZE_T Size) {
  /* PAGE_NOACCESS is only a placeholder: a valid protection value must be
   * supplied even though MEM_RESET does not apply it. */
  if (!VirtualAlloc(VirtualAddress, Size, MEM_RESET, PAGE_NOACCESS))
    return GetLastError();
  return ERROR_SUCCESS;
}
/*----------------------------------------------------------------------------*/
MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory;
MDBX_NtFsControlFile mdbx_NtFsControlFile; MDBX_NtFsControlFile mdbx_NtFsControlFile;
static void mdbx_winnt_import(void) { static void mdbx_winnt_import(void) {
@ -700,6 +710,9 @@ static void mdbx_winnt_import(void) {
GET_KERNEL32_PROC(GetFinalPathNameByHandleW); GET_KERNEL32_PROC(GetFinalPathNameByHandleW);
GET_KERNEL32_PROC(SetFileInformationByHandle); GET_KERNEL32_PROC(SetFileInformationByHandle);
GET_KERNEL32_PROC(PrefetchVirtualMemory); GET_KERNEL32_PROC(PrefetchVirtualMemory);
GET_KERNEL32_PROC(DiscardVirtualMemory);
if (!mdbx_DiscardVirtualMemory)
mdbx_DiscardVirtualMemory = stub_DiscardVirtualMemory;
const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");
mdbx_NtFsControlFile = mdbx_NtFsControlFile =

View File

@ -5400,31 +5400,51 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
} }
/* LY: check conditions to shrink datafile */
const pgno_t backlog_gap =
pending->mm_dbs[FREE_DBI].md_depth + mdbx_backlog_extragap(env);
pgno_t shrink = 0; pgno_t shrink = 0;
if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink && if (flags & MDBX_SHRINK_ALLOWED) {
pending->mm_geo.now - pending->mm_geo.next > /* LY: check conditions to discard unused pages */
pending->mm_geo.shrink + backlog_gap) { const pgno_t largest_pgno = mdbx_find_largest(
const pgno_t largest = mdbx_find_largest(
env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next
: pending->mm_geo.next); : pending->mm_geo.next);
if (pending->mm_geo.now > largest && const size_t largest_aligned2os_bytes =
pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) { pgno_align2os_bytes(env, largest_pgno);
const pgno_t aligner = const pgno_t largest_aligned2os_pgno =
pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; bytes2pgno(env, largest_aligned2os_bytes);
const pgno_t with_backlog_gap = largest + backlog_gap; const pgno_t prev_discarded_pgno = *env->me_discarded_tail;
const pgno_t aligned = pgno_align2os_pgno( *env->me_discarded_tail = largest_aligned2os_pgno;
env, with_backlog_gap + aligner - with_backlog_gap % aligner); if (prev_discarded_pgno > largest_aligned2os_pgno) {
const pgno_t bottom = const size_t prev_discarded_bytes =
(aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; pgno_align2os_bytes(env, prev_discarded_pgno);
if (pending->mm_geo.now > bottom) { mdbx_ensure(env, prev_discarded_bytes > largest_aligned2os_bytes);
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ #if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED)
shrink = pending->mm_geo.now - bottom; (void)madvise(env->me_map + largest_aligned2os_bytes,
pending->mm_geo.now = bottom; prev_discarded_bytes - largest_aligned2os_bytes,
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) MADV_REMOVE_OR_FREE_OR_DONTNEED);
mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); #endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
}
/* LY: check conditions to shrink datafile */
const pgno_t backlog_gap =
pending->mm_dbs[FREE_DBI].md_depth + mdbx_backlog_extragap(env);
if (pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next >
pending->mm_geo.shrink + backlog_gap) {
if (pending->mm_geo.now > largest_pgno &&
pending->mm_geo.now - largest_pgno >
pending->mm_geo.shrink + backlog_gap) {
const pgno_t aligner = pending->mm_geo.grow ? pending->mm_geo.grow
: pending->mm_geo.shrink;
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
const pgno_t aligned = pgno_align2os_pgno(
env, with_backlog_gap + aligner - with_backlog_gap % aligner);
const pgno_t bottom =
(aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
if (pending->mm_geo.now > bottom) {
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
shrink = pending->mm_geo.now - bottom;
pending->mm_geo.now = bottom;
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a)
mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1);
}
} }
} }
} }
@ -5755,14 +5775,16 @@ static int __cold mdbx_env_map(MDBX_env *env, const int is_exclusive,
#endif #endif
if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) { if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) {
#ifdef MADV_REMOVE_OR_FREE const size_t used_aligned2os_bytes =
const size_t used_alined2os = mdbx_roundup2(usedsize, env->me_os_psize); mdbx_roundup2(usedsize, env->me_os_psize);
if (used_alined2os < env->me_mapsize) *env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes);
(void)madvise(env->me_map + used_alined2os, if (used_aligned2os_bytes < env->me_mapsize) {
env->me_mapsize - used_alined2os, MADV_REMOVE_OR_FREE); #if defined(MADV_REMOVE_OR_FREE_OR_DONTNEED)
#else (void)madvise(env->me_map + used_aligned2os_bytes,
(void)usedsize; env->me_mapsize - used_aligned2os_bytes,
#endif MADV_REMOVE_OR_FREE_OR_DONTNEED);
#endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
}
} }
#ifdef POSIX_FADV_RANDOM #ifdef POSIX_FADV_RANDOM
@ -6511,6 +6533,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
env->me_autosync_period = &env->me_lckless_stub.autosync_period; env->me_autosync_period = &env->me_lckless_stub.autosync_period;
env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending; env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending;
env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold; env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold;
env->me_discarded_tail = &env->me_lckless_stub.discarded_tail;
env->me_maxreaders = UINT_MAX; env->me_maxreaders = UINT_MAX;
#ifdef MDBX_OSAL_LOCK #ifdef MDBX_OSAL_LOCK
env->me_wmutex = &env->me_lckless_stub.wmutex; env->me_wmutex = &env->me_lckless_stub.wmutex;
@ -6623,6 +6646,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
env->me_autosync_period = &env->me_lck->mti_autosync_period; env->me_autosync_period = &env->me_lck->mti_autosync_period;
env->me_unsynced_pages = &env->me_lck->mti_unsynced_pages; env->me_unsynced_pages = &env->me_lck->mti_unsynced_pages;
env->me_autosync_threshold = &env->me_lck->mti_autosync_threshold; env->me_autosync_threshold = &env->me_lck->mti_autosync_threshold;
env->me_discarded_tail = &env->me_lck->mti_discarded_tail;
#ifdef MDBX_OSAL_LOCK #ifdef MDBX_OSAL_LOCK
env->me_wmutex = &env->me_lck->mti_wmutex; env->me_wmutex = &env->me_lck->mti_wmutex;
#endif #endif

View File

@ -200,13 +200,15 @@ typedef pthread_mutex_t mdbx_fastmutex_t;
#define MADV_DONTDUMP MADV_NOCORE #define MADV_DONTDUMP MADV_NOCORE
#endif /* MADV_NOCORE -> MADV_DONTDUMP */ #endif /* MADV_NOCORE -> MADV_DONTDUMP */
#ifndef MADV_REMOVE_OR_FREE #ifndef MADV_REMOVE_OR_FREE_OR_DONTNEED
#ifdef MADV_REMOVE #ifdef MADV_REMOVE
#define MADV_REMOVE_OR_FREE MADV_REMOVE #define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_REMOVE
#elif defined(MADV_FREE) #elif defined(MADV_FREE)
#define MADV_REMOVE_OR_FREE MADV_FREE #define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_FREE
#elif defined(MADV_DONTNEED)
#define MADV_REMOVE_OR_FREE_OR_DONTNEED MADV_DONTNEED
#endif #endif
#endif /* MADV_REMOVE_OR_FREE */ #endif /* MADV_REMOVE_OR_FREE_OR_DONTNEED */
#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
defined(i486) || defined(__i486) || defined(__i486__) || \ defined(i486) || defined(__i486) || defined(__i486__) || \
@ -765,6 +767,10 @@ typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
extern MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; extern MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
/* Pointer type for a DiscardVirtualMemory()-style routine: takes a start
 * address and a size, and returns a DWORD status code. */
typedef DWORD(WINAPI *MDBX_DiscardVirtualMemory)(PVOID VirtualAddress,
SIZE_T Size);
/* Defined in another translation unit; presumably bound at runtime like the
 * neighbouring mdbx_* function pointers -- confirm at the definition. */
extern MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory;
#endif /* Windows */ #endif /* Windows */
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/