mdbx: add a check via mincore() with caching of page residency in memory (build option MDBX_ENABLE_MINCORE).

Леонид Юрьев (Leonid Yuriev) 2022-12-05 10:41:05 +03:00
parent be3ff92772
commit a772a9d3e1
6 changed files with 171 additions and 32 deletions
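For context: the probe the commit message refers to is the POSIX mincore() call, which reports per-page residency of a mapping. A minimal standalone sketch of the underlying technique (not libmdbx code; the file path and error handling are illustrative, and the vector type follows the Linux declaration):

    /* Sketch: report how many pages of a file mapping are resident in RAM.
     * Linux declares mincore() as taking "unsigned char *vec"; some BSDs
     * use "char *". One byte per page is filled, bit 0 = resident. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv) {
      const char *path = argc > 1 ? argv[1] : "test.db"; /* illustrative */
      const int fd = open(path, O_RDONLY);
      struct stat st;
      if (fd < 0 || fstat(fd, &st) != 0 || st.st_size == 0)
        return perror(path), EXIT_FAILURE;
      void *const map =
          mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_SHARED, fd, 0);
      if (map == MAP_FAILED)
        return perror("mmap"), EXIT_FAILURE;
      const size_t psize = (size_t)sysconf(_SC_PAGESIZE);
      const size_t pages = ((size_t)st.st_size + psize - 1) / psize;
      unsigned char *const vec = malloc(pages);
      if (vec && mincore(map, (size_t)st.st_size, vec) == 0) {
        size_t resident = 0;
        for (size_t i = 0; i < pages; ++i)
          resident += vec[i] & 1; /* bit 0 set = page resident */
        printf("%zu of %zu pages resident\n", resident, pages);
      } else
        perror("mincore");
      free(vec);
      munmap(map, (size_t)st.st_size);
      close(fd);
      return EXIT_SUCCESS;
    }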

mdbx.h

@@ -2594,6 +2594,7 @@ struct MDBX_envinfo {
uint64_t wops; /**< Number of explicit write operations (not pages)
to disk */
uint64_t prefault; /**< Number of prefault write operations (not pages) */
uint64_t mincore; /**< Number of mincore() calls */
uint64_t
msync; /**< Number of explicit msync-to-disk operations (not pages) */
uint64_t

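The new counter is reachable through the public stats. A hedged usage sketch (mdbx_env_info_ex() and mi_pgop_stat are the public API per mdbx.h; the helper itself is illustrative and assumes an already-open environment):

    #include <inttypes.h>
    #include <stdio.h>
    #include "mdbx.h"

    /* Dump the prefault/mincore counters for an open environment. */
    static void dump_pgop_stat(const MDBX_env *env) {
      MDBX_envinfo info;
      int rc = mdbx_env_info_ex(env, /* txn */ NULL, &info, sizeof(info));
      if (rc == MDBX_SUCCESS)
        printf("prefault writes: %" PRIu64 ", mincore() calls: %" PRIu64 "\n",
               info.mi_pgop_stat.prefault, info.mi_pgop_stat.mincore);
    }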
src/core.c

@@ -5599,6 +5599,11 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp,
return MDBX_SUCCESS;
}
static void mincore_clean_cache(const MDBX_env *const env) {
memset(env->me_lck->mti_mincore_cache.begin, -1,
sizeof(env->me_lck->mti_mincore_cache.begin));
}
#if !(defined(_WIN32) || defined(_WIN64))
MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) {
#ifdef ENOSYS
@@ -5723,6 +5728,7 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge,
#endif
}
} else {
mincore_clean_cache(env);
#if defined(MADV_RANDOM)
err =
madvise(ptr, length, MADV_RANDOM) ? ignore_enosys(errno) : MDBX_SUCCESS;
@@ -5938,6 +5944,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
? 0
: bytes2pgno(env, size_bytes);
munlock_after(env, aligned_munlock_pgno, size_bytes);
mincore_clean_cache(env);
#if MDBX_ENABLE_MADVISE
if (size_bytes < prev_size) {
@@ -6753,6 +6760,99 @@ __hot static pgno_t pnl_get_sequence(MDBX_PNL pnl, const size_t num,
return 0;
}
#if MDBX_ENABLE_MINCORE
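/* Test-and-set of one mask bit: returns the previous state of the bit
 * and leaves it set. */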
static __inline bool bit_tas(uint64_t *field, char bit) {
const uint64_t m = UINT64_C(1) << bit;
const bool r = (*field & m) != 0;
*field |= m;
return r;
}
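/* Slow path of mincore_probe(): look for the 64-unit window in the older
 * cache slots and move a hit to the front (MRU); on a complete miss query
 * the kernel via mincore() and install the result as the new MRU entry. */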
static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) {
MDBX_lockinfo *const lck = env->me_lck;
for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) {
const ptrdiff_t dist = unit_begin - lck->mti_mincore_cache.begin[i];
if (likely(dist >= 0 && dist < 64)) {
const pgno_t tmp_begin = lck->mti_mincore_cache.begin[i];
const uint64_t tmp_mask = lck->mti_mincore_cache.mask[i];
do {
lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1];
lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1];
} while (--i);
lck->mti_mincore_cache.begin[0] = tmp_begin;
lck->mti_mincore_cache.mask[0] = tmp_mask;
return bit_tas(lck->mti_mincore_cache.mask, (char)dist);
}
}
size_t pages = 64;
unsigned unit_log = sys_pagesize_ln2;
unsigned shift = 0;
if (env->me_psize > env->me_os_psize) {
unit_log = env->me_psize2log;
shift = env->me_psize2log - sys_pagesize_ln2;
pages <<= shift;
}
const size_t offset = unit_begin << unit_log;
size_t length = pages << sys_pagesize_ln2;
if (offset + length > env->me_dxb_mmap.current) {
length = env->me_dxb_mmap.current - offset;
pages = length >> sys_pagesize_ln2;
}
#if MDBX_ENABLE_PGOP_STAT
env->me_lck->mti_pgop_stat.mincore.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
uint8_t *const vector = alloca(pages);
if (unlikely(mincore(ptr_disp(env->me_dxb_mmap.base, offset), length,
(void *)vector))) {
NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno);
return false;
}
/* Shift entries towards the tail so slot 0 becomes free for the fresh
 * result; iterate backwards to avoid clobbering the older entries. */
for (size_t i = ARRAY_LENGTH(lck->mti_mincore_cache.begin) - 1; i > 0; --i) {
  lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1];
  lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1];
}
lck->mti_mincore_cache.begin[0] = unit_begin;
uint64_t mask = 0;
#ifdef MINCORE_INCORE
STATIC_ASSERT(MINCORE_INCORE == 1);
#endif
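/* Fold OS pages into DB-page units: a set bit in `mask` marks a unit with
 * at least one non-resident OS page; the inverted mask is stored, so a set
 * bit means "resident". */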
for (size_t i = 0; i < pages; ++i) {
uint64_t bit = (vector[i] & 1) == 0;
bit <<= i >> shift;
mask |= bit;
}
lck->mti_mincore_cache.mask[0] = ~mask;
return bit_tas(lck->mti_mincore_cache.mask, 0);
}
#endif /* MDBX_ENABLE_MINCORE */
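/* Fast path: probe the MRU cache slot for the 64-unit window covering pgno,
 * falling back to mincore_fetch() on a miss. Returns true when the page is
 * believed resident (or was already probed/prefaulted), i.e. the prefault
 * write may be skipped. */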
MDBX_MAYBE_UNUSED static __inline bool mincore_probe(MDBX_env *const env,
const pgno_t pgno) {
#if MDBX_ENABLE_MINCORE
const size_t offset_aligned =
floor_powerof2(pgno2bytes(env, pgno), env->me_os_psize);
const unsigned unit_log2 = (env->me_psize2log > sys_pagesize_ln2)
? env->me_psize2log
: sys_pagesize_ln2;
const size_t unit_begin = offset_aligned >> unit_log2;
eASSERT(env, (unit_begin << unit_log2) == offset_aligned);
const ptrdiff_t dist = unit_begin - env->me_lck->mti_mincore_cache.begin[0];
if (likely(dist >= 0 && dist < 64))
return bit_tas(env->me_lck->mti_mincore_cache.mask, (char)dist);
return mincore_fetch(env, unit_begin);
#else
(void)env;
(void)pgno;
return false;
#endif /* MDBX_ENABLE_MINCORE */
}
static __inline pgr_t page_alloc_finalize(MDBX_env *const env,
MDBX_txn *const txn,
const MDBX_cursor *const mc,
@@ -6769,6 +6869,7 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env,
ENSURE(env, pgno >= NUM_METAS);
pgr_t ret;
bool need_clean = (env->me_flags & MDBX_PAGEPERTURB) != 0;
if (env->me_flags & MDBX_WRITEMAP) {
ret.page = pgno2page(env, pgno);
MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num));
@@ -6789,51 +6890,62 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env,
 * from disk. Any write to disk should then be deferred by an adequate
 * kernel, since the page is mapped into memory read-write and the CPU
 * writes into it right afterwards. */
const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1;
const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1;
/* Don't bother if the page is inside the region with readahead enabled */
if (!readahead_enabled || pgno + num > readahead_edge) {
  void *const pattern = ptr_disp(
      env->me_pbuf, need_clean ? env->me_psize : env->me_psize * 2);
  size_t file_offset = pgno2bytes(env, pgno);
  if (likely(num == 1)) {
    if (!mincore_probe(env, pgno)) {
      osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset);
#if MDBX_ENABLE_PGOP_STAT
      env->me_lck->mti_pgop_stat.prefault.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      need_clean = false;
    }
  } else {
    struct iovec iov[MDBX_AUXILARY_IOV_MAX];
    size_t n = 0, cleared = 0;
    for (size_t i = 0; i < num; ++i) {
      if (!mincore_probe(env, pgno + (pgno_t)i)) {
        ++cleared;
        iov[n].iov_len = env->me_psize;
        iov[n].iov_base = pattern;
        if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) {
          osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX,
                       file_offset);
#if MDBX_ENABLE_PGOP_STAT
          env->me_lck->mti_pgop_stat.prefault.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
          file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
          n = 0;
        }
      }
    }
    if (likely(n > 0)) {
      osal_pwritev(env->me_lazy_fd, iov, n, file_offset);
#if MDBX_ENABLE_PGOP_STAT
      env->me_lck->mti_pgop_stat.prefault.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    }
    if (cleared == num)
      need_clean = false;
  }
}
#endif /* MDBX_ENABLE_PREFAULT */
} else {
  ret.page = page_malloc(txn, num);
  if (unlikely(!ret.page)) {
    ret.err = MDBX_ENOMEM;
    goto bailout;
  }
}
if (unlikely(need_clean))
  memset(ret.page, -1, pgno2bytes(env, num));
VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
ret.page->mp_pgno = pgno;
ret.page->mp_leaf2_ksize = 0;
@@ -14427,6 +14539,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
}
}
mincore_clean_cache(env);
const int dxb_rc = setup_dxb(env, lck_rc, mode);
if (MDBX_IS_ERROR(dxb_rc)) {
rc = dxb_rc;
@@ -21639,6 +21752,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
arg->mi_pgop_stat.prefault =
atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed);
arg->mi_pgop_stat.mincore =
atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed);
arg->mi_pgop_stat.msync =
atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed);
arg->mi_pgop_stat.fsync =
@@ -24760,6 +24875,7 @@ __dll_export
" MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
" MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE)
" MDBX_ENABLE_PREFAULT=" MDBX_STRINGIFY(MDBX_ENABLE_PREFAULT)
" MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE)
" MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT)
" MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC)
#if MDBX_DISABLE_VALIDATION

src/internals.h

@@ -620,6 +620,7 @@ typedef struct pgop_stat {
fsync; /* Number of explicit fsync/flush-to-disk operations */
MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */
MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */
/* Statistics for GC profiling.
* Logically it might make sense to move these data into a separate structure,
@@ -813,6 +814,12 @@ typedef struct MDBX_lockinfo {
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;
/* Shared cache for mincore() results */
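/* Four LRU-ordered entries: begin[i] is the first DB-page unit of a
 * 64-unit window, mask[i] has a bit set for every unit known to be
 * resident or already prefaulted (see mincore_fetch() in core.c). */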
struct {
pgno_t begin[4];
uint64_t mask[4];
} mti_mincore_cache;
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
/* Readers registration lock. */

src/options.h

@@ -99,6 +99,19 @@
#error MDBX_ENABLE_PREFAULT must be defined as 0 or 1
#endif /* MDBX_ENABLE_PREFAULT */
/** Controls the use of Unix' mincore() to determine whether DB pages
* are resident in memory. */
#ifndef MDBX_ENABLE_MINCORE
#if MDBX_ENABLE_PREFAULT && \
(defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64)))
#define MDBX_ENABLE_MINCORE 1
#else
#define MDBX_ENABLE_MINCORE 0
#endif
#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1)
#error MDBX_ENABLE_MINCORE must be defined as 0 or 1
#endif /* MDBX_ENABLE_MINCORE */
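When building from the amalgamated sources, the option can also be forced from the compiler command line, e.g. with -DMDBX_ENABLE_MINCORE=1 (an illustrative invocation; per the block above, the default already enables it on non-Windows builds whenever MDBX_ENABLE_PREFAULT is on).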
/** Enables chunking of the long list of retired pages during commit of huge
* transactions, to avoid the need for long sequences of free pages. */
#ifndef MDBX_ENABLE_BIGFOOT

src/osal.c

@@ -3336,7 +3336,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages,
#ifndef xMDBX_ALLOY
unsigned sys_pagesize;
MDBX_MAYBE_UNUSED unsigned sys_pagesize_ln2, sys_allocation_granularity;
#endif /* xMDBX_ALLOY */
void osal_ctor(void) {
@@ -3362,6 +3362,7 @@ void osal_ctor(void) {
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
assert(sys_allocation_granularity >= sys_pagesize &&
sys_allocation_granularity % sys_pagesize == 0);
sys_pagesize_ln2 = log2n_powerof2(sys_pagesize);
#if defined(__linux__) || defined(__gnu_linux__)
posix_clockid = choice_monoclock();

src/osal.h

@@ -211,7 +211,8 @@ typedef pthread_mutex_t osal_fastmutex_t;
/* OS abstraction layer stuff */
MDBX_INTERNAL_VAR unsigned sys_pagesize;
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2,
    sys_allocation_granularity;
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is