mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-30 07:32:06 +08:00
mdbx: добавление проверки посредством mincore() с кэшированием присутствия страниц в памяти (опция сборки MDBX_ENABLE_MINCORE).
This commit is contained in:
parent
be3ff92772
commit
a772a9d3e1
1
mdbx.h
1
mdbx.h
@ -2594,6 +2594,7 @@ struct MDBX_envinfo {
|
||||
uint64_t wops; /**< Number of explicit write operations (not a pages)
|
||||
to a disk */
|
||||
uint64_t prefault; /**< Number of prefault write operations (not a pages) */
|
||||
uint64_t mincore; /**< Number of mincore() calls */
|
||||
uint64_t
|
||||
msync; /**< Number of explicit msync-to-disk operations (not a pages) */
|
||||
uint64_t
|
||||
|
176
src/core.c
176
src/core.c
@ -5599,6 +5599,11 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp,
|
||||
return MDBX_SUCCESS;
|
||||
}
|
||||
|
||||
static void mincore_clean_cache(const MDBX_env *const env) {
|
||||
memset(env->me_lck->mti_mincore_cache.begin, -1,
|
||||
sizeof(env->me_lck->mti_mincore_cache.begin));
|
||||
}
|
||||
|
||||
#if !(defined(_WIN32) || defined(_WIN64))
|
||||
MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) {
|
||||
#ifdef ENOSYS
|
||||
@ -5723,6 +5728,7 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge,
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
mincore_clean_cache(env);
|
||||
#if defined(MADV_RANDOM)
|
||||
err =
|
||||
madvise(ptr, length, MADV_RANDOM) ? ignore_enosys(errno) : MDBX_SUCCESS;
|
||||
@ -5938,6 +5944,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
|
||||
? 0
|
||||
: bytes2pgno(env, size_bytes);
|
||||
munlock_after(env, aligned_munlock_pgno, size_bytes);
|
||||
mincore_clean_cache(env);
|
||||
|
||||
#if MDBX_ENABLE_MADVISE
|
||||
if (size_bytes < prev_size) {
|
||||
@ -6753,6 +6760,99 @@ __hot static pgno_t pnl_get_sequence(MDBX_PNL pnl, const size_t num,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if MDBX_ENABLE_MINCORE
|
||||
/* Test-and-set helper: returns the previous value of bit `bit` in *field
 * and leaves that bit set afterwards. `bit` must be in [0, 63]. */
static __inline bool bit_tas(uint64_t *field, char bit) {
  const uint64_t probe = *field >> bit;
  *field |= UINT64_C(1) << bit;
  return (probe & 1) != 0;
}
|
||||
|
||||
/* Query the OS via mincore() for the residency of a 64-unit window starting
 * at `unit_begin`, refresh the shared MRU cache with the result, and return
 * the (test-and-set) residency bit of the window's requested unit.
 *
 * A "unit" is the larger of the DB page and the OS page; each cache entry
 * covers 64 consecutive units as a bitmask (bit set = unit resident).
 * NOTE(review): the cache lives in the shared MDBX_lockinfo and is updated
 * without locking — presumably intentionally "weak"; confirm with callers. */
static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) {
  MDBX_lockinfo *const lck = env->me_lck;

  /* Re-scan the non-MRU entries first: another thread/process may have
   * fetched this window since the caller missed on slot 0. On a hit, rotate
   * the entry to the front (slots above the hit shift down by one). */
  for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) {
    const ptrdiff_t dist = unit_begin - lck->mti_mincore_cache.begin[i];
    if (likely(dist >= 0 && dist < 64)) {
      const pgno_t tmp_begin = lck->mti_mincore_cache.begin[i];
      const uint64_t tmp_mask = lck->mti_mincore_cache.mask[i];
      do {
        lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1];
        lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1];
      } while (--i);
      lck->mti_mincore_cache.begin[0] = tmp_begin;
      lck->mti_mincore_cache.mask[0] = tmp_mask;
      return bit_tas(lck->mti_mincore_cache.mask, (char)dist);
    }
  }

  /* Window geometry: 64 units; when the DB page is larger than the OS page
   * each unit spans 2^shift OS pages, so correspondingly more OS pages must
   * be queried to cover the window. */
  size_t pages = 64;
  unsigned unit_log = sys_pagesize_ln2;
  unsigned shift = 0;
  if (env->me_psize > env->me_os_psize) {
    unit_log = env->me_psize2log;
    shift = env->me_psize2log - sys_pagesize_ln2;
    pages <<= shift;
  }

  const size_t offset = unit_begin << unit_log;
  size_t length = pages << sys_pagesize_ln2;
  if (offset + length > env->me_dxb_mmap.current) {
    /* Clamp the query window to the current end of the mapped datafile. */
    length = env->me_dxb_mmap.current - offset;
    pages = length >> sys_pagesize_ln2;
  }

#if MDBX_ENABLE_PGOP_STAT
  env->me_lck->mti_pgop_stat.mincore.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  /* Bounded alloca: pages <= 64 << shift (one byte per OS page). */
  uint8_t *const vector = alloca(pages);
  if (unlikely(mincore(ptr_disp(env->me_dxb_mmap.base, offset), length,
                       (void *)vector))) {
    NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno);
    return false;
  }

  /* Make room at slot 0 for the fresh entry by shifting all entries down,
   * dropping the oldest. Must iterate from the tail toward the head: an
   * ascending copy would overwrite begin[i-1]/mask[i-1] before it is read,
   * duplicating slot 0 into every slot instead of shifting. */
  for (size_t i = ARRAY_LENGTH(lck->mti_mincore_cache.begin); --i > 0;) {
    lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1];
    lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1];
  }
  lck->mti_mincore_cache.begin[0] = unit_begin;

  uint64_t mask = 0;
#ifdef MINCORE_INCORE
  STATIC_ASSERT(MINCORE_INCORE == 1);
#endif
  for (size_t i = 0; i < pages; ++i) {
    /* Collect "absent" bits: a unit counts as absent if ANY of its 2^shift
     * OS pages is not resident (LSB of each vector byte = residency). */
    uint64_t bit = (vector[i] & 1) == 0;
    bit <<= i >> shift;
    mask |= bit;
  }

  /* Store the complement (bit set = unit resident), then report — and mark
   * via test-and-set — the residency of the window's first unit. */
  lck->mti_mincore_cache.mask[0] = ~mask;
  return bit_tas(lck->mti_mincore_cache.mask, 0);
}
|
||||
#endif /* MDBX_ENABLE_MINCORE */
|
||||
|
||||
MDBX_MAYBE_UNUSED static __inline bool mincore_probe(MDBX_env *const env,
|
||||
const pgno_t pgno) {
|
||||
#if MDBX_ENABLE_MINCORE
|
||||
const size_t offset_aligned =
|
||||
floor_powerof2(pgno2bytes(env, pgno), env->me_os_psize);
|
||||
const unsigned unit_log2 = (env->me_psize2log > sys_pagesize_ln2)
|
||||
? env->me_psize2log
|
||||
: sys_pagesize_ln2;
|
||||
const size_t unit_begin = offset_aligned >> unit_log2;
|
||||
eASSERT(env, (unit_begin << unit_log2) == offset_aligned);
|
||||
const ptrdiff_t dist = unit_begin - env->me_lck->mti_mincore_cache.begin[0];
|
||||
if (likely(dist >= 0 && dist < 64))
|
||||
return bit_tas(env->me_lck->mti_mincore_cache.mask, (char)dist);
|
||||
return mincore_fetch(env, unit_begin);
|
||||
#else
|
||||
(void)env;
|
||||
(void)pgno;
|
||||
return false;
|
||||
#endif /* MDBX_ENABLE_MINCORE */
|
||||
}
|
||||
|
||||
static __inline pgr_t page_alloc_finalize(MDBX_env *const env,
|
||||
MDBX_txn *const txn,
|
||||
const MDBX_cursor *const mc,
|
||||
@ -6769,6 +6869,7 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env,
|
||||
ENSURE(env, pgno >= NUM_METAS);
|
||||
|
||||
pgr_t ret;
|
||||
bool need_clean = (env->me_flags & MDBX_PAGEPERTURB) != 0;
|
||||
if (env->me_flags & MDBX_WRITEMAP) {
|
||||
ret.page = pgno2page(env, pgno);
|
||||
MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num));
|
||||
@ -6789,51 +6890,62 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env,
|
||||
* с диска. При этом запись на диск должна быть отложена адекватным ядром,
|
||||
* так как страница отображена в память в режиме чтения-записи и следом в
|
||||
* неё пишет ЦПУ. */
|
||||
void *const pattern = ptr_disp(
|
||||
env->me_pbuf,
|
||||
(env->me_flags & MDBX_PAGEPERTURB) ? env->me_psize : env->me_psize * 2);
|
||||
size_t file_offset = pgno2bytes(env, pgno);
|
||||
/* TODO: добавить проверку через mincore() c кэшированием результатов. */
|
||||
if (likely(num == 1)) {
|
||||
osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset);
|
||||
} else {
|
||||
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
|
||||
iov[0].iov_len = env->me_psize;
|
||||
iov[0].iov_base = pattern;
|
||||
size_t n = 1, left = num - 1;
|
||||
do {
|
||||
iov[n].iov_len = env->me_psize;
|
||||
iov[n].iov_base = pattern;
|
||||
if (++n == MDBX_AUXILARY_IOV_MAX) {
|
||||
osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX,
|
||||
file_offset);
|
||||
file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
|
||||
const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1;
|
||||
const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1;
|
||||
/* Не суетимся если страница в зоне включенного упреждающего чтения */
|
||||
if (!readahead_enabled || pgno + num > readahead_edge) {
|
||||
void *const pattern = ptr_disp(
|
||||
env->me_pbuf, need_clean ? env->me_psize : env->me_psize * 2);
|
||||
size_t file_offset = pgno2bytes(env, pgno);
|
||||
if (likely(num == 1)) {
|
||||
if (!mincore_probe(env, pgno)) {
|
||||
osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.prefault.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
n = 0;
|
||||
need_clean = false;
|
||||
}
|
||||
} while (--left);
|
||||
osal_pwritev(env->me_lazy_fd, iov, n, file_offset);
|
||||
}
|
||||
} else {
|
||||
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
|
||||
size_t n = 0, cleared = 0;
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
if (!mincore_probe(env, pgno + (pgno_t)i)) {
|
||||
++cleared;
|
||||
iov[n].iov_len = env->me_psize;
|
||||
iov[n].iov_base = pattern;
|
||||
if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) {
|
||||
osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX,
|
||||
file_offset);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.prefault.weak += 1;
|
||||
env->me_lck->mti_pgop_stat.prefault.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
#else
|
||||
if (unlikely(env->me_flags & MDBX_PAGEPERTURB))
|
||||
memset(ret.page, -1, pgno2bytes(env, num));
|
||||
file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
|
||||
n = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (likely(n > 0)) {
|
||||
osal_pwritev(env->me_lazy_fd, iov, n, file_offset);
|
||||
#if MDBX_ENABLE_PGOP_STAT
|
||||
env->me_lck->mti_pgop_stat.prefault.weak += 1;
|
||||
#endif /* MDBX_ENABLE_PGOP_STAT */
|
||||
}
|
||||
if (cleared == num)
|
||||
need_clean = false;
|
||||
}
|
||||
}
|
||||
#endif /* MDBX_ENABLE_PREFAULT */
|
||||
|
||||
} else {
|
||||
ret.page = page_malloc(txn, num);
|
||||
if (unlikely(!ret.page)) {
|
||||
ret.err = MDBX_ENOMEM;
|
||||
goto bailout;
|
||||
}
|
||||
if (unlikely(env->me_flags & MDBX_PAGEPERTURB))
|
||||
memset(ret.page, -1, pgno2bytes(env, num));
|
||||
}
|
||||
|
||||
if (unlikely(need_clean))
|
||||
memset(ret.page, -1, pgno2bytes(env, num));
|
||||
|
||||
VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
|
||||
ret.page->mp_pgno = pgno;
|
||||
ret.page->mp_leaf2_ksize = 0;
|
||||
@ -14427,6 +14539,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
|
||||
}
|
||||
}
|
||||
|
||||
mincore_clean_cache(env);
|
||||
const int dxb_rc = setup_dxb(env, lck_rc, mode);
|
||||
if (MDBX_IS_ERROR(dxb_rc)) {
|
||||
rc = dxb_rc;
|
||||
@ -21639,6 +21752,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
|
||||
atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
|
||||
arg->mi_pgop_stat.prefault =
|
||||
atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed);
|
||||
arg->mi_pgop_stat.mincore =
|
||||
atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed);
|
||||
arg->mi_pgop_stat.msync =
|
||||
atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed);
|
||||
arg->mi_pgop_stat.fsync =
|
||||
@ -24760,6 +24875,7 @@ __dll_export
|
||||
" MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
|
||||
" MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE)
|
||||
" MDBX_ENABLE_PREFAULT=" MDBX_STRINGIFY(MDBX_ENABLE_PREFAULT)
|
||||
" MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE)
|
||||
" MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT)
|
||||
" MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC)
|
||||
#if MDBX_DISABLE_VALIDATION
|
||||
|
@ -620,6 +620,7 @@ typedef struct pgop_stat {
|
||||
fsync; /* Number of explicit fsync/flush-to-disk operations */
|
||||
|
||||
MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */
|
||||
MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */
|
||||
|
||||
/* Статистика для профилирования GC.
|
||||
* Логически эти данные может быть стоит вынести в другую структуру,
|
||||
@ -813,6 +814,12 @@ typedef struct MDBX_lockinfo {
|
||||
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
|
||||
pgno_t mti_readahead_anchor;
|
||||
|
||||
/* Shared cache for mincore() results */
|
||||
struct {
|
||||
pgno_t begin[4];
|
||||
uint64_t mask[4];
|
||||
} mti_mincore_cache;
|
||||
|
||||
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
||||
|
||||
/* Readeaders registration lock. */
|
||||
|
@ -99,6 +99,19 @@
|
||||
#error MDBX_ENABLE_PREFAULT must be defined as 0 or 1
|
||||
#endif /* MDBX_ENABLE_PREFAULT */
|
||||
|
||||
/** Controls using Unix' mincore() to determine whether DB-pages
 * are resident in memory. */
#ifndef MDBX_ENABLE_MINCORE
/* Auto-enable only where it can work: prefault support must be on, and the
 * target is either a non-Windows (POSIX) system or one that explicitly
 * provides MINCORE_INCORE. */
#if MDBX_ENABLE_PREFAULT && \
    (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64)))
#define MDBX_ENABLE_MINCORE 1
#else
#define MDBX_ENABLE_MINCORE 0
#endif
/* When user-defined, accept only an explicit boolean 0/1. */
#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1)
#error MDBX_ENABLE_MINCORE must be defined as 0 or 1
#endif /* MDBX_ENABLE_MINCORE */
|
||||
|
||||
/** Enables chunking long list of retired pages during huge transactions commit
|
||||
* to avoid use sequences of pages. */
|
||||
#ifndef MDBX_ENABLE_BIGFOOT
|
||||
|
@ -3336,7 +3336,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages,
|
||||
|
||||
#ifndef xMDBX_ALLOY
|
||||
unsigned sys_pagesize;
|
||||
MDBX_MAYBE_UNUSED unsigned sys_allocation_granularity;
|
||||
MDBX_MAYBE_UNUSED unsigned sys_pagesize_ln2, sys_allocation_granularity;
|
||||
#endif /* xMDBX_ALLOY */
|
||||
|
||||
void osal_ctor(void) {
|
||||
@ -3362,6 +3362,7 @@ void osal_ctor(void) {
|
||||
assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0);
|
||||
assert(sys_allocation_granularity >= sys_pagesize &&
|
||||
sys_allocation_granularity % sys_pagesize == 0);
|
||||
sys_pagesize_ln2 = log2n_powerof2(sys_pagesize);
|
||||
|
||||
#if defined(__linux__) || defined(__gnu_linux__)
|
||||
posix_clockid = choice_monoclock();
|
||||
|
@ -211,7 +211,8 @@ typedef pthread_mutex_t osal_fastmutex_t;
|
||||
/* OS abstraction layer stuff */
|
||||
|
||||
MDBX_INTERNAL_VAR unsigned sys_pagesize;
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_allocation_granularity;
|
||||
MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2,
|
||||
sys_allocation_granularity;
|
||||
|
||||
/* Get the size of a memory page for the system.
|
||||
* This is the basic size that the platform's memory manager uses, and is
|
||||
|
Loading…
x
Reference in New Issue
Block a user